Date: (Wed) Nov 04, 2015
Data: Source: Training: https://inclass.kaggle.com/c/15-071x-the-analytics-edge-summer-2015/download/eBayiPadTrain.csv
New: https://inclass.kaggle.com/c/15-071x-the-analytics-edge-summer-2015/download/eBayiPadTest.csv
Time period:
Based on analysis utilizing <> techniques,
Regression results: First run:
Classification results: template: prdline.my == “Unknown” -> 296 Low.cor.X.glm: Leaderboard: 0.83458 -> Rank 288 / 1884 0.85514 newobs_tbl=[N=471, Y=327]; submit_filename=template_Final_glm_submit.csv OOB_conf_mtrx=[YN=125, NY=76]=201; max.Accuracy.OOB=0.7710; opt.prob.threshold.OOB=0.6 startprice=100.00; biddable=95.42; productline=49.22; D.T.like=29.75; D.T.use=26.32; D.T.box=21.53;
prdline: -> Worse than template prdline.my == “Unknown” -> 285 All.X.no.rnorm.rf: Leaderboard: 0.82649 newobs_tbl=[N=485, Y=313]; submit_filename=prdline_Final_rf_submit.csv OOB_conf_mtrx=[YN=119, NY=80]=199; max.Accuracy.OOB=0.8339; opt.prob.threshold.OOB=0.5 startprice=100.00; biddable=84.25; D.sum.TfIdf=7.28; D.T.use=4.26; D.T.veri=2.78; D.T.scratch=1.99; D.T.box=; D.T.like=; Low.cor.X.glm: Leaderboard: 0.81234 newobs_tbl=[N=471, Y=327]; submit_filename=prdline_Low_cor_X_glm_submit.csv OOB_conf_mtrx=[YN=125, NY=74]=199; max.Accuracy.OOB=0.8205; opt.prob.threshold.OOB=0.6 startprice=100.00; biddable=96.07; prdline.my=51.37; D.T.like=29.39; D.T.use=25.43; D.T.box=22.27; D.T.veri=; D.T.scratch=;
oobssmpl: -> Low.cor.X.glm: Leaderboard: 0.83402 newobs_tbl=[N=440, Y=358]; submit_filename=oobsmpl_Final_glm_submit OOB_conf_mtrx=[YN=114, NY=84]=198; max.Accuracy.OOB=0.7780; opt.prob.threshold.OOB=0.5 startprice=100.00; biddable=93.87; prdline.my=60.48; D.sum.TfIdf=; D.T.condition=8.69; D.T.screen=7.96; D.T.use=7.50; D.T.veri=; D.T.scratch=;
category: -> Low.cor.X.glm: Leaderboard: 0.82381 newobs_tbl=[N=470, Y=328]; submit_filename=category_Final_glm_submit OOB_conf_mtrx=[YN=119, NY=57]=176; max.Accuracy.OOB=0.8011; opt.prob.threshold.OOB=0.6 startprice=100.00; biddable=79.19; prdline.my=55.22; D.sum.TfIdf=; D.T.ipad=27.05; D.T.like=21.44; D.T.box=20.67; D.T.condition=; D.T.screen=;
dataclns: -> All.X.no.rnorm.rf: Leaderboard: 0.82211 newobs_tbl=[N=485, Y=313]; submit_filename=dataclns_Final_rf_submit OOB_conf_mtrx=[YN=104, NY=75]=179; max.Accuracy.OOB=0.7977; opt.prob.threshold.OOB=0.5 startprice.log=100.00; biddable=65.85; prdline.my=7.74; D.sum.TfIdf=; D.T.use=2.01; D.T.condition=1.87; D.T.veri=1.62; D.T.ipad=; D.T.like=; Low.cor.X.glm: Leaderboard: 0.79264 newobs_tbl=[N=460, Y=338]; submit_filename=dataclns_Low_cor_X_glm_submit OOB_conf_mtrx=[YN=113, NY=74]=187; max.Accuracy.OOB=0.7977; opt.prob.threshold.OOB=0.5 -> different from prev run of 0.6 biddable=100.00; startprice.log=91.85; prdline.my=38.34; D.sum.TfIdf=; D.T.ipad=29.92; D.T.box=27.76; D.T.work=25.79; D.T.use=; D.T.condition=;
txtterms: -> top_n = c(10) Low.cor.X.glm: Leaderboard: 0.81448 newobs_tbl=[N=442, Y=356]; submit_filename=txtterms_Final_glm_submit OOB_conf_mtrx=[YN=113, NY=69]=182; max.Accuracy.OOB=0.7943; opt.prob.threshold.OOB=0.5 biddable=100.00; startprice.log=90.11; prdline.my=37.65; D.sum.TfIdf=; D.T.ipad=28.67; D.T.work=24.90; D.T.great=21.44; # [1] “D.T.condit” “D.T.condition” “D.T.good” “D.T.ipad” “D.T.new”
# [6] “D.T.scratch” “D.T.screen” “D.T.this” “D.T.use” “D.T.work”
All.X.glm: Leaderboard: 0.81016
newobs_tbl=[N=445, Y=353]; submit_filename=txtterms_Final_glm_submit
OOB_conf_mtrx=[YN=108, NY=72]=180; max.Accuracy.OOB=0.7966;
opt.prob.threshold.OOB=0.5
biddable=100.00; startprice.log=88.24; prdline.my=33.81; D.sum.TfIdf=;
D.T.scratch=25.51; D.T.use=18.97; D.T.good=16.37;
[1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.great” “D.T.excel” “D.T.work” “D.T.ipad”
Max.cor.Y.rpart: Leaderboard: 0.79258
newobs_tbl=[N=439, Y=359]; submit_filename=txtterms_Final_rpart_submit
OOB_conf_mtrx=[YN=105, NY=76]=181; max.Accuracy.OOB=0.7954802;
opt.prob.threshold.OOB=0.5
startprice.log=100; biddable=; prdline.my=; D.sum.TfIdf=;
D.T.scratch=; D.T.use=; D.T.good=;
[1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.ipad” “D.T.great” “D.T.work” “D.T.excel”
All.X.no.rnorm.rf: Leaderboard: 0.80929
newobs_tbl=[N=545, Y=253]; submit_filename=txtterms_Final_rf_submit
OOB_conf_mtrx=[YN=108, NY=61]=169; max.Accuracy.OOB=0.8090395
opt.prob.threshold.OOB=0.5
startprice.log=100.00; biddable=78.82; idseq.my=63.43; prdline.my=45.57;
D.T.use=2.76; D.T.condit=2.35; D.T.scratch=2.00; D.T.good=;
[1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.ipad” “D.T.great” “D.T.work” “D.T.excel”
txtclstr: All.X.no.rnorm.rf: Leaderboard: 0.79363 -> 0.79573 newobs_tbl=[N=537, Y=261]; submit_filename=txtclstr_Final_rf_submit OOB_conf_mtrx=[YN=104, NY=61]=165; max.Accuracy.OOB=0.8135593 opt.prob.threshold.OOB=0.5 startprice.log=100.00; biddable=79.99; idseq.my=64.94; prdline.my=4.14; prdline.my.clusterid=1.15; [1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.ipad” “D.T.great” “D.T.work” “D.T.excel”
dupobs: All.X.no.rnorm.rf: Leaderboard: 0.79295 newobs_tbl=[N=541, Y=257]; submit_filename=dupobs_Final_rf_submit OOB_conf_mtrx=[YN=114, NY=65]=179; max.Accuracy.OOB=0.7977401 opt.prob.threshold.OOB=0.5 startprice.log=100.00; biddable=94.49; idseq.my=67.40; prdline.my=4.48; prdline.my.clusterid=1.99; [1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.ipad” “D.T.great” “D.T.work” “D.T.excel”
All.X.no.rnorm.rf: Leaderboard: 0.79652
newobs_tbl=[N=523, Y=275]; submit_filename=dupobs_Final_rf_submit
OOB_conf_mtrx=[YN=114, NY=65]=179; max.Accuracy.OOB=0.7977401
opt.prob.threshold.OOB=0.5
startprice.log=100.00; biddable=94.24; idseq.my=67.92;
prdline.my=4.33; prdline.my.clusterid=2.17;
[1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.ipad” “D.T.great” “D.T.work” “D.T.excel”
csmmdl: All.X.no.rnorm.rf: Leaderboard: 0.79396 newobs_tbl=[N=525, Y=273]; submit_filename=csmmdl_Final_rf_submit OOB_conf_mtrx=[YN=111, NY=66]=177; max.Accuracy.OOB=0.8000000 opt.prob.threshold.OOB=0.5 startprice.log=100.00; biddable=90.30; idseq.my=67.06; prdline.my=4.40; cellular.fctr=3.57; prdline.my.clusterid=2.08;
All.Interact.X.no.rnorm.rf: Leaderboard: 0.77867 newobs_tbl=[N=564, Y=234]; submit_filename=csmmdl_Final_rf_submit OOB_conf_mtrx=[YN=120, NY=53]=173; max.Accuracy.OOB=0.8045198 opt.prob.threshold.OOB=0.5 biddable=100.00; startprice.log=93.99; idseq.my=57.30; prdline.my=9.09; cellular.fctr=3.30; prdline.my.clusterid=2.35;
All.Interact.X.no.rnorm.rf: Leaderboard: 0.77152 newobs_tbl=[N=539, Y=259]; submit_filename=csmmdl_Final_rf_submit OOB_conf_mtrx=[YN=, NY=]=; max.Accuracy.OOB=0.8011299 opt.prob.threshold.OOB=0.5 biddable=100.00; startprice.log=94.93; idseq.my=57.12; prdline.my=9.29; cellular.fctr=3.20; prdline.my.clusterid=2.50; [1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.ipad” “D.T.great” “D.T.work” “D.T.excel”
All.X.glmnet:
fit_RMSE=???; OOB_RMSE=115.1247; new_RMSE=115.1247;
prdline.my.fctr=100.00; condition.fctrNew=88.53; D.npnct09.log=84.34
biddable=16.48; idseq.my=57.27;
spdiff:
All.Interact.X.no.rnorm.rf: Leaderboard: 0.78218 newobs_tbl=[N=517, Y=281]; submit_filename=spdiff_Final_rf_submit OOB_conf_mtrx=[YN=121, NY=38]=159; max.Accuracy.OOB=0.8203390 opt.prob.threshold.OOB=0.6 biddable=100.00; startprice.diff=57.53; idseq.my=41.31; prdline.my=11.43; cellular.fctr=2.36; prdline.my.clusterid=1.82;
All.X.no.rnorm.rf:
fit_RMSE=92.19; OOB_RMSE=130.86; new_RMSE=130.86;
biddable=100.00; prdline.my.fctr=61.92; idseq.my=57.77;
condition.fctr=29.53; storage.fctr=11.22; color.fctr=6.69;
cellular.fctr=6.11
All.X.no.rnorm.rf: Leaderboard: 0.77443
newobs_tbl=[N=606, Y=192]; submit_filename=spdiff_Final_rf_submit
OOB_conf_mtrx=[YN=112, NY=28]=140; max.Accuracy.OOB=0.8418079
opt.prob.threshold.OOB=0.6
startprice.diff=100.00; biddable=96.53; idseq.my=38.10;
prdline.my=3.65; cellular.fctr=2.21; prdline.my.clusterid=0.91;
[1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.ipad” “D.T.great” “D.T.work” “D.T.excel”
color: All.Interact.X.glmnet: fit_RMSE=88.64520; prdline.my.fctr:D.TfIdf.sum.stem.stop.Ratio=100.00; prdline.my.fctr:condition.fctr=77.35 D.TfIdf.sum.stem.stop.Ratio=68.18 prdline.my.fctr:color.fctr=68.12 prdline.my.fctr:storage.fctr=63.32
All.X.no.rnorm.rf: Leaderboard: 0.80638
newobs_tbl=[N=550, Y=248]; submit_filename=color_Final_rf_submit
OOB_conf_mtrx=[YN=108, NY=54]=162; max.Accuracy.OOB=0.8169492
opt.prob.threshold.OOB=0.5
biddable=100.00; startprice.diff=77.90; idseq.my=48.49;
D.ratio.sum.TfIdf.nwrds=6.48; storage.fctr=4.74;
D.TfIdf.sum.stem.stop.Ratio=4.57; prdline.my=4.32;
[1] “D.T.condit” “D.T.use” “D.T.scratch” “D.T.new” “D.T.good” “D.T.screen” [7] “D.T.ipad” “D.T.great” “D.T.work” “D.T.excel”
All.Interact.X.no.rnorm.rf: Leaderboard: 0.72974
newobs_tbl=[N=682, Y=116]; submit_filename=assctxt_Final_rf_submit
OOB_conf_mtrx=[YN=125, NY=43]=168; max.Accuracy.OOB=0.8101695; max.auc.OOB=???;
opt.prob.threshold.OOB=0.6
biddable=100.00; startprice.diff=51.04; idseq.my=29.51;
startprice.diff:biddable=28.70
prdline.my.fctriPadmini:idseq.my=6.89
Highest max.auc.OOB=???; for model:
gbm w/startprice.unit9: Final.glmnet: min.RMSE.fit=30.32782 Ensemble.glmnet: min.RMSE.fit=29.62348 startprice.predict.All.Interact.X.no.rnorm.rf 100.000 startprice.predict.All.X.no.rnorm.rf 73.521 startprice.predict.All.Interact.X.bayesglm 29.675 startprice.predict.Max.cor.Y.lm 28.405
All.X.glmnet: min.RMSE.fit=88.64271
prdl.my.descr.fctr 100.00 D.TfIdf.sum.stem.stop.Ratio 85.01 condition.fctr 80.28 carrier.fctr 77.48 prdl.my.descr.fctr:.clusterid.fctr5 65.78 D.npnct16.log 61.66 startprice.unit9 59.48 color.fctr 59.21 D.npnct01.log 53.78 D.npnct08.log 53.56 cellular.fctr 53.19
Ensemble.glmnet: Leaderboard: not submitted -> lower max.auc.OOB of "Ensemble submission"
newobs_tbl=[N=579, Y=219]; submit_filename=gbm_Final_glmnet_submit
OOB_conf_mtrx=[YN=85, NY=54]=139;
max.Accuracy.OOB=0.8438202; max.auc.OOB=0.9127314; opt.prob.threshold.OOB=0.5
sold.fctr.predict.All.X.no.rnorm.rf.prob 100.0000 sold.fctr.predict.All.Interact.X.no.rnorm.rf.prob 98.7937
Highest max.auc.OOB=0.9167568; for model:All.Interact.X.gbm biddable 100.0000 startprice.diff 96.2076 startprice.diff:biddable 23.2114 idseq.my 7.8098
mdlsel: Final.glmnet: min.RMSE.fit=30.47114 (higher than gbm w/startprice.unit9) Ensemble.glmnet: min.RMSE.fit=29.49418 startprice.predict.All.Interact.X.no.rnorm.rf 100.000000 startprice.predict.All.X.no.rnorm.rf 71.213880 startprice.predict.All.X.bayesglm 24.166084
All.X.glmnet: min.RMSE.fit=88.64271
prdl.my.descr.fctr 100.00 D.TfIdf.sum.stem.stop.Ratio 85.01 condition.fctr 80.28 carrier.fctr 77.48 prdl.my.descr.fctr:.clusterid.fctr5 65.78 D.npnct16.log 61.66 startprice.unit9 59.48 color.fctr 59.21 D.npnct01.log 53.78 D.npnct08.log 53.56 cellular.fctr 53.19
mdlsel(startprice.log): Final.Ensemble.rf: min.RMSE.fit=0.4563772 Ensemble.rf: min.RMSE.fit=0.4283013 startprice.log.predict.All.Interact.X.no.rnorm.rf 100.0000000 startprice.log.predict.All.X.no.rnorm.rf 58.0967582 startprice.log.predict.All.Interact.X.gbm 6.7197148
All.X.no.rnorm.rf: min.RMSE.fit=1.4967021
biddable 100.00000000 idseq.my 98.00292371 startprice.unit9 34.31130220 prdl.my.descr.fctr 18.10984741 D.ratio.sum.TfIdf.nwrds 15.23549621 color.fctrUnknown 14.05520993 D.TfIdf.sum.stem.stop.Ratio 13.00884673 D.ratio.nstopwrds.nwrds 10.51165302
All.X.gbm: Leaderboard: 0.75430
newobs_tbl=[N=582, Y=216]; submit_filename=mdlsel_Final_gbm_submit
OOB_conf_mtrx=[YN=58, NY=65]=123;
max.Accuracy.OOB=0.8617978; max.auc.OOB=0.9367161;
opt.prob.threshold.OOB=0.5
startprice.diff 100.0000000 100.00000000 biddable 66.6475055 65.40764971 idseq.my 1.8632456 4.55963698
splogdiff: All.X.gbm: Leaderboard: 0.70111 newobs_tbl=[N=553, Y=245]; submit_filename=splogdiff_Final_gbm_submit OOB_conf_mtrx=[YN=35, NY=101]=136; max.Accuracy.OOB=0.8471910; max.auc.OOB=0.9388912; opt.prob.threshold.OOB=0.3 startprice.log.diff 100.0000000 100.0000000 biddable 86.8563123 88.0261866 idseq.my 8.3580281 2.9054298
Forum Ideas: I then focused on feature engineering; each new variable brought its own little improvement, so in the end I just kept adding new ones and let the models do their thing. Here are some I used: model (productline:storage:condition), isNew, model2 (product:isNew), 50 common words from descr, descrLength, capsFactor (% of caps in description), number of cheaper items of same model2, number of dearer items of same model2, priceFactor (vs. mean of price for model), priceFactor2 (vs. mean of price for model2), bigID (if ID > 11000, because there seems to be a huge drop in sales after some time), timeline (year of product launch; the reasoning is that you want to spend less money on older products).
Get the median startprice for each level of productline and condition. Take the difference from startprice as a new variable. I find median works much better than the mean since startprice is not normally distributed. I also created another binary variable on whether this difference is positive or negative.
Square root startprice
scale and center all the variables except sold, including the dummies.
Use plot.ly for interactive plots ?
varImp for randomForest crashes in caret version:6.0.41 -> submit bug report
extensions toward multiclass classification are scheduled for the next release
glm_dmy_mdl should use the same method as glm_sel_mdl until a custom dummy classifier is implemented
# Session setup: clear the workspace, fix the RNG seed, and load helper scripts
# and packages. Order matters: rm() must run before anything is defined.
rm(list = ls())
# Fixed seed for reproducible runs
set.seed(12345)
# Keep character columns as character unless explicitly converted
options(stringsAsFactors = FALSE)
# Personal helper scripts (paths are specific to the author's machine)
source("~/Dropbox/datascience/R/myscript.R")
source("~/Dropbox/datascience/R/mydsutils.R")
# The next three "##" lines are console output captured in the rendered report
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
source("~/Dropbox/datascience/R/myplot.R")
source("~/Dropbox/datascience/R/mypetrinet.R")
source("~/Dropbox/datascience/R/myplclust.R")
source("~/Dropbox/datascience/R/mytm.R")
# Gather all package requirements here
suppressPackageStartupMessages(require(doMC))
registerDoMC(6) # # of cores on machine - 2
suppressPackageStartupMessages(require(caret))
# NOTE(review): these files are sourced from a local caret checkout after caret is
# attached, presumably to override the packaged versions with patched ones - confirm
source("~/Documents/Work/PullRequests/caret/pkg/caret/R/confusionMatrix.R")
source("~/Documents/Work/PullRequests/caret/pkg/caret/R/ggplot.R")
#packageVersion("tm")
#require(sos); findFn("cosine", maxPages=2, sortby="MaxScore")
# Analysis control global variables
# Inputs
# Download URLs for the training data and for the new data to be predicted
glb_trnng_url <- "https://inclass.kaggle.com/c/15-071x-the-analytics-edge-summer-2015/download/eBayiPadTrain.csv"
glb_newdt_url <- "https://inclass.kaggle.com/c/15-071x-the-analytics-edge-summer-2015/download/eBayiPadTest.csv"
# Extra input files; NULL is the default.
# NOTE(review): presumably outputs of the bid0/bid1 start-price runs that get merged
# into the observations - confirm against the consuming code
glbInpMerge <- # NULL #: default
list(fnames = c("ebayipads_finmdl_bid0_sp_out.csv", "ebayipads_mdlens_bid1_sp_out.csv"))
glb_is_separate_newobs_dataset <- TRUE # or FALSE
# Settings for splitting off new observations
# NOTE(review): presumably only relevant when the new data is not a separate dataset - confirm
glb_split_entity_newobs_datasets <- FALSE # select from c(FALSE, TRUE)
glb_split_newdata_method <- NULL # select from c(NULL, "condition", "sample", "copy")
glb_split_newdata_condition <- NULL # or "is.na(<var>)"; "<var> <condition_operator> <value>"
glb_split_newdata_size_ratio <- 0.3 # > 0 & < 1
glb_split_sample.seed <- 123 # or any integer
# Observation-drop condition, stored as the text of an R expression (see the
# commented-out parse() call below). Everything between the double quotes,
# including the "#..." annotations, is part of the string literal and not a
# live comment, so edit it with care.
glbObsDropCondition <- # default : NULL
"(UniqueID %in% c(NULL
, 11234 #sold=0; 2 other dups(10306, 11503) are sold=1
, 11844 #sold=0; 3 other dups(11721, 11738, 11812) are sold=1
)) |
(productline %in% c('iPad 5', 'iPad mini Retina')) |
(biddable != 0) # bid0_sp
# | (biddable == 0) # bid1_sp
"
# Debugging aids
#parse(text=glbObsDropCondition)
#subset(glb_allobs_df, .grpid %in% c(31))
# Response / problem-type configuration.
# The "!_sp" and "bid._sp" tags in the trailing comments mark the value to use for
# each analysis variant; the uncommented value is the one in effect.
glb_obs_repartition_train_condition <- NULL
# "!is.na(sold) & (sold == 1)" # : bid._sp
glb_max_fitobs <- NULL # or any integer
# Classification is defined as the complement of regression
glb_is_regression <- FALSE; glb_is_classification <- !glb_is_regression;
glb_is_binomial <- TRUE #or FALSE
# Name of the response column as it appears in the raw data
glb_rsp_var_raw <- "sold" #: !_sp # "startprice" # : bid._sp #
# for classification, the response variable has to be a factor
# Name of the (possibly transformed) response column used for modelling
glb_rsp_var <- "sold.fctr" #:!_sp # "startprice.log10" :bid._sp # glb_rsp_var_raw :default
# if the response factor is based on numbers/logicals e.g (0/1 OR TRUE/FALSE vs. "A"/"B"),
# or contains spaces (e.g. "Not in Labor Force")
# caret predict(..., type="prob") crashes
# Map the raw response to the modelling response.
#
# Args:
#   raw: vector of raw response values; 1 maps to "Y", any other non-missing
#        value maps to "N", and NA stays NA.
# Returns:
#   A factor of the same length as `raw` with levels c("N", "Y"); "N" is the
#   first (reference) level.
#
# Both levels are declared explicitly. The previous
# relevel(as.factor(...), ref = "N") raised an error whenever "N" did not occur
# in the input (all NA, all 1, or zero-length). Output for inputs containing
# both classes is unchanged.
glb_map_rsp_raw_to_var <- function(raw) { # NULL
    # Alternative mappings used by the other analysis variants:
    # return(raw ^ 0.5)
    # return(log(1 + raw))
    # return(log10(raw)) # bid._sp
    # return(exp(-raw / 2))
    is_known <- !is.na(raw)
    ret_vals <- rep_len(NA_character_, length(raw))
    # Only non-missing entries are labelled; missing ones remain NA
    ret_vals[is_known] <- ifelse(raw[is_known] == 1, "Y", "N")
    return(factor(ret_vals, levels = c("N", "Y")))
    # #as.factor(paste0("B", raw))
    # #as.factor(gsub(" ", "\\.", raw))
}
# glb_map_rsp_raw_to_var(tst <- c(NA, 0, 1)) # !_sp
# glb_map_rsp_raw_to_var(tst <- c(NA, 0, 2.99, 280.50, 1000.00)) # bid._sp
# Map the modelling response back to the raw response scale.
#
# Args:
#   var: modelling response; for classification this is expected to be a factor,
#        in which case as.numeric() yields its 1-based level codes.
# Returns:
#   Numeric vector of level codes shifted to start at 0 (first level -> 0,
#   second level -> 1, ...); NA stays NA.
glb_map_rsp_var_to_raw <- function(var) { # NULL #
    # Inverses of the alternative mappings used by the other analysis variants:
    # return(var ^ 2.0)
    # return(exp(var) - 1)
    # return(10 ^ var) # bid._sp
    # return(-log(var) * 2)
    level_codes <- as.numeric(var)
    return(level_codes - 1)
    # #as.numeric(var)
    # #gsub("\\.", " ", levels(var)[as.numeric(var)])
    # c("<=50K", " >50K")[as.numeric(var)]
    # #c(FALSE, TRUE)[as.numeric(var)]
}
# glb_map_rsp_var_to_raw(glb_map_rsp_raw_to_var(tst))
# A response column that differs from the raw one needs a raw -> modelling
# mapping function; stop early if it has been set to NULL.
if (glb_rsp_var != glb_rsp_var_raw) {
    if (is.null(glb_map_rsp_raw_to_var)) {
        stop("glb_map_rsp_raw_to_var function expected")
    }
}
# Prefix for prediction columns; mdl_id is appended later
glb_rsp_var_out <- paste(glb_rsp_var, ".predict.", sep = "")
# List info gathered for various columns
# <col_name>: <description>; <notes>
# description = The text description of the product provided by the seller.
# biddable = Whether this is an auction (biddable=1) or a sale with a fixed price (biddable=0).
# startprice = The start price (in US Dollars) for the auction (if biddable=1) or the sale price (if biddable=0).
# condition = The condition of the product (new, used, etc.)
# cellular = Whether the iPad has cellular connectivity (cellular=1) or not (cellular=0).
# carrier = The cellular carrier for which the iPad is equipped (if cellular=1); listed as "None" if cellular=0.
# color = The color of the iPad.
# storage = The iPad's storage capacity (in gigabytes).
# productline = The name of the product being sold.
# If multiple vars are parts of id, consider concatenating them to create one id var
# If glb_id_var == NULL, ".rownames <- row.names()" is the default
# Derive a numeric feature from id var
# User-specified exclusions
# Features to exclude, e.g. because of known causation by the prediction variable.
# The "### <variant>" markers delimit the entries belonging to each analysis
# variant (!_sp, bid0_sp, bid1_sp); only the !_sp entries are currently active.
glbFeatsExclude <- c(
    ### !_sp
    "description", "descr.my", "productline",
    "startprice", "startprice.log10.predict", "sprice.predict.diff"
    ### bid0_sp
    # , "description", "productline"
    # , "sold", "startprice.log10.cut.fctr"
    # # List feats that are linear combinations (alias in glm)
    # , "D.terms.post.stem.n.log", "D.weight.sum"
    # #, "prdl.descr.my.fctriPad4#1:.clusterid.fctr3" This does not work
    # # if RFE is rated lower than Low.cor, list feats that are in RFE & not in Low.cor
    # # min.RMSE.fit(RFE.X.glmnet)=0.1138888
    # # D.chrs.n.log 61.12483
    # # D.chrs.uppr.n.log 61.12483
    # # D.ratio.wrds.stop.n.wrds.n 61.12483
    # # D.terms.post.stop.n.log 61.12483
    # # D.weight.post.stem.sum 61.12483
    # # D.wrds.n.log 61.12483
    # # D.wrds.stop.n.log 61.12483
    # # D.wrds.unq.n.log 61.12483
    # #, "startprice.dcm2.is9" # min.RMSE.fit(RFE.X.glmnet)=0.1141991 (up)
    # , "D.wrds.stop.n.log" # min.RMSE.fit(RFE.X.glmnet)=0.1131232
    ### bid0_sp
    ### bid1_sp
    # , "description", "productline"
    # , "sold", "startprice.log10.cut.fctr"
    ### bid1_sp
)
# Identifier column of the observations
glb_id_var <- "UniqueID"
# Category column; alternatives: "productline" or NULL
glb_category_var <- "prdl.descr.my.fctr"
# Columns to drop: NULL or c("<col_name>")
glb_drop_vars <- NULL
# Columns to map: NULL or c("<var1>", "<var2>")
glb_map_vars <- NULL
# Per-variable map URLs, e.g.
#   glb_map_urls[["<var1>"]] <- "<var1.url>"
glb_map_urls <- list()
# Per-variable value re-assignments, e.g.
#   glb_assign_pairs_lst[["<var1>"]] <- list(from=c(NA),
#                                            to=c("NA.my"))
glb_assign_pairs_lst <- NULL
# Variables that have re-assignment pairs (NULL while the list above is NULL)
glb_assign_vars <- names(glb_assign_pairs_lst)
# Derived features
# glbFeatsDerive is a named list; each entry is keyed by the name of the new
# feature and holds
#   mapfn - function that computes the feature
#   args  - character vector naming mapfn's parameters
# NOTE(review): args presumably name the existing columns that the derivation
# helper passes to mapfn - confirm against the consuming code.
glbFeatsDerive <- NULL;
# Add logs of numerics that are not distributed normally -> do automatically ???
# Right skew: logp1; sqrt; ^ 1/3; logp1(logp1); log10; exp(-<feat>/constant)
# glbFeatsDerive[["prdline.my"]] <- list(
#     mapfn=function(productline) { return(productline) }
#   , args=c("productline"))
### bid._sp
# glbFeatsDerive[["startprice.log10.cut.fctr"]] <- list(
#     mapfn=function(startprice.log10) { return(cut(startprice.log10, 3)) }
#   , args=c("startprice.log10"))
### bid._sp

# Square root of the start price
glbFeatsDerive[["sprice.root2"]] <- list(
    mapfn = function(startprice) { return(startprice ^ (1/2)) }
  , args = c("startprice"))

# Base-10 log of the start price.
# Fix: this previously used natural log(), which contradicted the feature name
# and the base-10 convention used below (10 ^ startprice.log10.predict).
glbFeatsDerive[["sprice.log10"]] <- list(
    mapfn = function(startprice) { return(log10(startprice)) }
  , args = c("startprice"))

# Negative-exponential transform with scale constant 20
glbFeatsDerive[["sprice.d20nexp"]] <- list(
    mapfn = function(startprice) { return(exp(-startprice / 20)) }
  , args = c("startprice"))

# Predicted start price (back-transformed from log10) minus the actual start price
glbFeatsDerive[["sprice.predict.diff"]] <- list(
    mapfn = function(startprice.log10.predict, startprice) {
        spdiff <- (10 ^ startprice.log10.predict) - startprice;
        return(spdiff) }
  , args = c("startprice.log10.predict", "startprice"))
# glbFeatsDerive[["spdiff.root10"]] <- list(
#     mapfn = function(sprice.predict.diff) {
#         return(sign(sprice.predict.diff) * (abs(sprice.predict.diff) ^ (1/10))) }
#   , args = c("sprice.predict.diff"))

# Price difference binned on a roughly logarithmic grid.
# NOTE(review): cut() intervals are right-closed, so differences <= -1000 or
# > 1000 become NA; widen the outer breaks if such values can occur.
glbFeatsDerive[["spdiff.cut.fctr"]] <- list(
    mapfn = function(sprice.predict.diff) {
        return(cut(sprice.predict.diff, c(-1000, -100, -10, -1, 0, 1, 10, 100, 1000))) }
  , args = c("sprice.predict.diff"))
#glb_allobs_df[which(glb_post_stop_words_terms_mtrx_lst[[txt_var]][, subset(glb_post_stop_words_terms_df_lst[[txt_var]], term %in% c("conditionminimal"))$pos] > 0), "description"]
glbFeatsDerive[["descr.my"]] <- list(
mapfn = function(description) { mod_raw <- description;
### bid._sp
# # This is here because it does not work with txt_map_filename
mod_raw <- gsub(paste0(c("\n", "\211", "\235", "\317", "\333"), collapse = "|"), " ",
mod_raw)
# # This should go into txt_map_filename
# mod_raw <- gsub("\\.\\.", "\\. ", mod_raw);
# # Don't parse for "." because of ".com"; use customized gsub for that text
# mod_raw <- gsub("(\\w)(!|\\*|,|-|/)(\\w)", "\\1\\2 \\3", mod_raw);
#mod_raw <- grep(""", glb_allobs_df$descr.my, value = TRUE)
mod_raw <- gsub("&", "&", mod_raw);
mod_raw <- gsub("<", "<", mod_raw);
mod_raw <- gsub(">", ">", mod_raw);
mod_raw <- gsub("<br>", " ", mod_raw); # line break - add a count for it ???
mod_raw <- gsub(""", " ", mod_raw); # justification meta-character
mod_raw <- gsub("&#(0*)37;", "%", mod_raw);
mod_raw <- gsub("'", "'", mod_raw);
mod_raw <- gsub("([[:digit:]])\\.([[:digit:]])\\.([[:digit:]])",
"\\1point\\2\\point\\3", mod_raw);
mod_raw <- gsub("([[:digit:]])\\.([[:digit:]])", "\\1point\\2", mod_raw);
mod_raw <- gsub("([[:digit:]]),([[:digit:]])", "\\1\\2", mod_raw);
mod_raw <- gsub("\\b1st\\b", "first", mod_raw);
mod_raw <- gsub("\\b2nd\\b", "second", mod_raw);
mod_raw <- gsub("\\b3rd\\b", "third", mod_raw);
mod_raw <- gsub("\\b4th\\b", "fourth", mod_raw);
mod_raw <- gsub("\\.(com|COM)\\b", "dot\\1", mod_raw);
#
# # Modifications for this exercise only
# # Add dictionary to stemDocument e.g. stickers stemmed to sticker ???
# mod_raw <- gsub("8\\.25", "825", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\b10\\.SCREEN\\b", "10\\. SCREEN", mod_raw);
mod_raw <- gsub("\\b128 gb\\b", "128gb", mod_raw);
mod_raw <- gsub("\\b16G\\b", "16GB", mod_raw);
# mod_raw <- gsub(" 16 gig ", " 16gb ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" 16 gb ", " 16gb ", mod_raw, ignore.case=TRUE);
#
# mod_raw <- gsub("\\bAccounts\\b", "Account", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\bactivated\\b", "activate", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" actuuly ", " actual ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\badaptor\\b", "adapter", mod_raw);
# mod_raw <- gsub("\\baffects\\b", "affect", mod_raw, ignore.case=FALSE);
mod_raw <- gsub("\\bair-like\\b", "air -like", mod_raw);
mod_raw <- gsub("\\bALL-JUST\\b", "ALL -JUST", mod_raw);
mod_raw <- gsub("\\bApple's\\b", "Apple'", mod_raw);
# #mod_raw <- glb_allobs_df[c(1322), txt_var]; mod_raw
mod_raw <- gsub("\\bApple care\\b", "Applecare", mod_raw);
mod_raw <- gsub("\\bAT&T\\b", "ATT", mod_raw);
# mod_raw <- gsub(" bacK!wiped ", " bacK ! wiped ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" backplate", " back plate", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bbarley", "barely", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" bend ", " bent ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\b(B|b)(EST|est) (B|b)(UY|uy)\\b", "\\1\\2\\3\\4", mod_raw);
# mod_raw <- gsub(" black\\.Device ", " black \\. Device ", mod_raw,
# ignore.case=TRUE);
# mod_raw <- gsub("black\\),charger ", "black\\), charger ", mod_raw,
# ignore.case=TRUE);
# mod_raw <- gsub("\\bblacked\\b", "black", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\bblemish\\b", "blemishes", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" blocks", " blocked", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" book ", " manual ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\b(B|b)(RAND|rand)( |-)(N|n)(EW|ew)\\b", "\\1\\2\\4\\5", mod_raw)
#mod_raw <- c("brand new", "BRAND new", "brand NEW", "BRAND NEW", "bbrand new", "brand-new", "brand newb")
mod_raw <- gsub("\\bbrokenCharger\\b", "broken Charger", mod_raw);
#
mod_raw <- gsub("\\bC-Major\\b", "C -Major", mod_raw)
# mod_raw <- gsub(" perfectlycord ", " perfectly cord ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bcord", "cable", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bcables\\.No\\b", "cables. No", mod_raw);
# mod_raw <- gsub("\\bcables\\b", "cable", mod_raw, ignore.case=TRUE);
#
mod_raw <- gsub("\\bcare\\.The\\b", "care\\. The", mod_raw);
# mod_raw <- gsub("\\b(cared|careful|CAREFUL)\\b", "care", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\b(cases|casing)\\b", "case", mod_raw, ignore.case=TRUE);
# #mod_raw <- glb_allobs_df[c(88,187,280,1040,1098), txt_var]; mod_raw
mod_raw <- gsub("\\bCase/Cover\\b", "Case/ Cover", mod_raw);
mod_raw <- gsub("\\bCasing/Screen\\b", "Casing/ Screen", mod_raw);
# mod_raw <- gsub(" carefully ", " careful ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bchargers\\b", "charger", mod_raw, ignore.case=FALSE);
mod_raw <- gsub("\\bchip/crack\\b", "chip/ crack", mod_raw);
# mod_raw <- gsub("\\bchips\\b", "chip", mod_raw, ignore.case=FALSE);
mod_raw <- gsub("\\bCLEAN\\!LIKE\\b", "CLEAN! LIKE", mod_raw);
# mod_raw <- gsub("\\bcleanly\\b", "clean", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\b(C|c)olor(.*)s\\b", "\\1olor", mod_raw, ignore.case=FALSE);
# #mod_raw <- glb_allobs_df[c(280,1411), txt_var]; mod_raw
mod_raw <- gsub("\\bColors,models\\b", "Colors ,models", mod_raw);
# mod_raw <- gsub("\\bcompletely\\b", "complete", mod_raw, ignore.case=FALSE);
# #mod_raw <- glb_allobs_df[c(178), txt_var]; mod_raw
#
mod_raw <- gsub("\\bCONDITION..CLEAN\\b", "CONDITION ..CLEAN", mod_raw);
mod_raw <- gsub("\\bcondition,comes\\b", "condition ,comes", mod_raw);
mod_raw <- gsub("\\bcondition\\.Device\\b", "condition .Device", mod_raw);
mod_raw <- gsub("\\bconditionHas\\b", "condition Has", mod_raw);
mod_raw <- gsub("\\bcondition\\.\\.\\.like\\b", "condition ...like", mod_raw);
mod_raw <- gsub("\\bcondition\\*Minimal\\b", "condition *Minimal", mod_raw);
mod_raw <- gsub("\\bCondition-Moderate\\b", "Condition -Moderate", mod_raw);
mod_raw <- gsub("\\bcondition\\.The\\b", "condition .The", mod_raw);
mod_raw <- gsub("\\bCONDITION\\.VERY\\b", "CONDITION .VERY", mod_raw);
# mod_raw <- gsub(" (conditon|condtion|contidion|conditions)", " condition", mod_raw,
# mod_raw <- gsub("\\b(conditon|condtion|contidion|conditions)\\b", "condition", mod_raw,
# ", "\\1\\. \\2", mod_raw,
# ignore.case=TRUE);
# mod_raw <- gsub("(condition)(Has)", "\\1\\. \\2", mod_raw);
#
# mod_raw <- gsub("\\bCONNECTED\\b", "CONNECT", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\bconnects\\b", "connect", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" consist ", " consistent ", mod_raw, ignore.case=TRUE);
# #mod_raw <- glb_allobs_df[c(195, 379, 437), txt_var]; mod_raw
# mod_raw <- gsub("\\bCosmetics\\b", "Cosmetic", mod_raw, ignore.case=FALSE);
mod_raw <- gsub("\\bCracked/Damaged\\b", "Cracked/ Damaged", mod_raw);
mod_raw <- gsub("\\bcracksNo\\b", "cracks No", mod_raw);
#
# mod_raw <- gsub("\\b(D|d)amaged\\b", "\\1amage", mod_raw, ignore.case=TRUE);
# #mod_raw <- glb_allobs_df[c(116, 1360), txt_var]; mod_raw
# mod_raw <- gsub("\\bDays\\b", "Day", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" DEFAULTING ", " DEFAULT ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bdefect(ive)*\\b", "defects", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" definitely ", " definite ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\b(D|d)ented\\b", "\\1ent", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" described", " describe", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" desciption", " description", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" devices", " device", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" Digi\\.", " Digitizer\\.", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\b(ding|dinged)\\b", "dings", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" display\\.New ", " display\\. New ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" displays", " display", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bdo( +)not\\b", "dont", mod_raw);
mod_raw <- gsub("\\b(D|d)oes( +)(N|n)(O|o)(T|t)\\b", "\\1oes\\3\\5", mod_raw);
# mod_raw <- gsub("\\b(drop|drops)\\b", "dropped", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\b(E|e)dge\\b", "\\1dges", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" effect ", " affect ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" Excellant ", " Excellent ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" excellently", " excellent", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" EUC ", " excellent used condition", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" feels ", " feel ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bfineiCloud\\b", "fine iCloud", mod_raw, ignore.case = FALSE);
# mod_raw <- gsub(" fine.Its ", " fine. Its ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bfix\\b", "fixed", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bflaws\\b", "flaw", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bflawlessly\\b", "flawless", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" Framing ", " Frame ", mod_raw, ignore.case=TRUE);
#
# mod_raw <- gsub(" functioanlity", " functionality", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bfunction(ing|ality)\\b", "functional", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" functional\\.Very little ", " functional\\. Very little ", mod_raw,
# ignore.case=TRUE);
mod_raw <- gsub("\\b([[:digit:]]+) (GB|gb)\\b", "\\1\\2", mod_raw);
mod_raw <- gsub("\\b([[:digit:]]+) gig\\b", "\\1gb", mod_raw);
mod_raw <- gsub("\\b(G|g)(EEK|eek) (S|s)(QUAD|quad)\\b", "\\1\\2\\3\\4", mod_raw);
# mod_raw <- gsub("^Gentle ", "Gently ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\(gray color", "\\(spacegray color", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" GREAT\\.SCreen ", " GREAT\\. SCreen ", mod_raw,
# ignore.case=TRUE);
mod_raw <- gsub("\\bGUARANTEES-IT\\b", "GUARANTEES -IT", mod_raw);
# mod_raw <- gsub("\\b(guarantee|guarantees)\\b", "guaranteed", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\ba handful of times\\b", "sparingly", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bhardly any\\b", "no", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bhardly ever used\\b", "sparingly used", mod_raw, ignore.case=TRUE);
#
mod_raw <- gsub("\\biCL0UD\\b", "iCLOUD", mod_raw);
mod_raw <- gsub("\\bI (CLOUD|cloud)\\b", "I\\1", mod_raw);
# mod_raw <- gsub("^iPad Black 3rd generation ", "iPad 3 Black ", mod_raw,
# ignore.case=TRUE);
mod_raw <- gsub("\\bIMEINo\\b", "IMEI No", mod_raw);
mod_raw <- gsub("\\bIMIE\\b", "IMEI", mod_raw);
# mod_raw <- gsub("\\bincluding\\b", "included", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" install\\. ", " installed\\. ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("inivisible", "invisible", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bI pad\\b", "Ipad", mod_raw);
mod_raw <- gsub("\\b(I|i)(P|p)(A|a)(D|d) (A|a)(I|i)(R|r)\\b", "\\1\\2\\3\\4\\5\\6\\7",
mod_raw);
mod_raw <- gsub("\\b(I|i)(P|p)(A|a)(D|d) (M|m)ini\\b", "\\1\\2\\3\\4\\5ini", mod_raw);
mod_raw <- gsub("\\b(I|i)(P|p)(A|a)(D|d) (M|m)inis\\b", "\\1\\2\\3\\4\\5ini", mod_raw);
mod_raw <- gsub("\\b(IPAD|Ipad|iPad|ipad) ([[:digit:]])\\b", "\\1\\2", mod_raw);
mod_raw <- gsub("\\b(Ipadair|iPadAir|ipadair) ([[:digit:]])\\b", "\\1\\2",
mod_raw);
mod_raw <- gsub("\\b(iPadMini|iPadmini) ([[:digit:]])\\b", "\\1\\2", mod_raw);
mod_raw <- gsub("\\bI Phone\\b", "IPhone", mod_raw);
mod_raw <- gsub("\\bIS-NO\\b", "IS -NO", mod_raw, ignore.case = FALSE)
#
# mod_raw <- gsub(" Keeped ", " Kept ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" knicks ", " nicks ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" lightening ", " lightning ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bLightning-to-USB\\b", "Lightning- to- USB", mod_raw);
mod_raw <- gsub("\\b(L|l)(IKE|ike)( |-)(N|n)(EW|ew)\\b", "\\1\\2\\4\\5", mod_raw);
#mod_raw <- c("like new", "LIKE new", "like NEW", "LIKE NEW", "blike new", "like-new", "like newb")
mod_raw <- gsub("\\bLIKENEW!ONE\\b", "LIKENEW! ONE", mod_raw);
# mod_raw <- gsub("\\b(lock|locks)\\b", "locked", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\blots\\b", "lot", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" manuals ", " manual ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" mars ", " marks ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" marks\\.Absolutely ", " marks\\. Absolutely ", mod_raw,
# ignore.case=TRUE);
# mod_raw <- gsub("\\bmarkings\\b", "marks", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bminis\\b", "mini", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" minimum", " minimal", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" MINT\\.wiped ", " MINT\\. wiped ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bmonth\\b", "months", mod_raw, ignore.case=TRUE);
# #mod_raw <- glb_allobs_df[c(1803), txt_var]; mod_raw
mod_raw <- gsub("\\bNew-Other\\b", "New -Other", mod_raw);
# mod_raw <- gsub(" NEW\\!(SCREEN|ONE) ", " NEW\\! \\1 ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" new looking$", " looks new", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" newer ", " new ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bnoted\\b", "note", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" oped ", " opened ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" opening", " opened", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" operated", " operational", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\botter box\\b", "otterbox", mod_raw);
#
# mod_raw <- gsub("\\bpackage\\b", "packaging", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\bPACKAGE\\b", "PACKAGing", mod_raw, ignore.case=FALSE);
# #mod_raw <- glb_allobs_df[c(360, 1142), txt_var]; mod_raw
mod_raw <- gsub("\\bperfectlycord\\b", "perfectly cord", mod_raw);
# mod_raw <- gsub(" performance", " performs", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" personalized ", " personal ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bPhysically\\b", "Physical", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\b(picture|pictured)\\b", "pictures", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\bPICTURE\\b", "PICTUREs", mod_raw, ignore.case=FALSE);
# #mod_raw <- glb_allobs_df[c(184, 892), txt_var]; mod_raw
# mod_raw <- gsub("\\b[P|p]ower(ed|ing|s)\\b", "\\1ower", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" pre- owned ", " used ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bprevious\\b", "previously", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bpreviously (owned|used)\\b", "used", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\b(P|p)reviously(.*)(O|o)wned\\b", "\\1reviously\\3wned\\2",
mod_raw);
mod_raw <- gsub("\\b(P|p)reviously(.*)(U|u)sed\\b", "\\1reviously\\3sed\\2", mod_raw);
# mod_raw <- gsub("\\bproblem\\b", "problems", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" products ", " product ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bprotected\\b", "protector", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\bprotection\\b", "protector", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\bPROTECTION\\b", "PROTECTOR", mod_raw, ignore.case=FALSE);
mod_raw <- gsub("\\bREADiPad\\b", "READ iPad", mod_raw);
# mod_raw <- gsub(" re- assemble ", " reassemble ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bREFURB\\.", "REFURBished.", mod_raw);
# mod_raw <- gsub(" reponding", " respond", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bright-hand\\b", "right -hand", mod_raw);
# mod_raw <- gsub(" rotation ", " rotate ", mod_raw, ignore.case=TRUE);
#
# mod_raw <- gsub(" Sales ", " Sale ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bScratch-Free\\b", "Scratch- Free", mod_raw);
mod_raw <- gsub("\\bSCRATCHES/BLEMISHES...SCRATCHES\\b", "SCRATCHES/ BLEMISHES... SCRATCHES", mod_raw);
mod_raw <- gsub("\\bscratches,clear\\b", "scratches, clear", mod_raw);
mod_raw <- gsub("\\bScratches/Dent\\b", "Scratches/ Dent", mod_raw);
mod_raw <- gsub("\\bScratches/scuffs/nicks/scrapes\\b", "Scratches/ scuffs/ nicks/ scrapes", mod_raw);
# mod_raw <- gsub(" scratchs ", " scratches ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\b(scratchs|scratching)\\b", "scratches", mod_raw, ignore.case=FALSE);
mod_raw <- gsub("\\bset up\\b", "setup", mod_raw);
# mod_raw <- gsub(" shipped| Shipment", " ship", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bshowing\\b", "shows", mod_raw, ignore.case=FALSE);
mod_raw <- gsub("\\b(shrink|SHRINK) (wrap|WRAP)", "\\1\\2", mod_raw);
# mod_raw <- gsub("\\bshuts\\b", "shut", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" sides ", " side ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" skinned,", " skin,", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bSlightly\\b", "slight", mod_raw, ignore.case=FALSE);
mod_raw <- gsub("\\b(Space|space) (G|g)r(a|e)y\\b", "\\1\\2ray", mod_raw);
# mod_raw <- gsub(" spec ", " speck ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\bsomescratches\\b", "some scratches", mod_raw);
# mod_raw <- gsub(" Sticker ", " Stickers ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bstoring", "store", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("SWAPPA\\.COM", "SWAPPAsdotCOM", mod_raw, ignore.case=TRUE);
#
# mod_raw <- gsub(" T- Mobile", " TMobile", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\b(tear|TEAR)(s|S)\\b", "\\1", mod_raw, ignore.case=FALSE);
# #mod_raw <- glb_allobs_df[c(376), txt_var]; mod_raw
mod_raw <- gsub("\\b(touch|Touch|TOUCH) (screen|SCREEN)\\b", "\\1\\2", mod_raw);
# mod_raw <- gsub("\\bTURN\\b", "TURNS", mod_raw, ignore.case=FALSE);
#
mod_raw <- gsub("\\bUnlockedCracked\\b", "Unlocked Cracked", mod_raw);
# mod_raw <- gsub("\\bUNUSABLE\\b", "UNUSED", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\b(update|updates)\\b", "updated", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub("\\bupgrade\\b", "upgraded", mod_raw, ignore.case=FALSE);
# mod_raw <- gsub(" uppser ", " upper ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("use*Case\\b", "use *Case", mod_raw, ignore.case = FALSE)
# mod_raw <- gsub(" use\\.Scratches ", " use\\. Scratches ", mod_raw,
# ignore.case=TRUE);
#
# mod_raw <- gsub(" verify ", " verified ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" wear\\.Device ", " wear\\. Device ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub("\\bwears\\b", "\\wear", mod_raw, ignore.case=TRUE);
# #mod_raw <- glb_allobs_df[c(167, 272), txt_var]; mod_raw
# mod_raw <- gsub(" whats ", " what's ", mod_raw, ignore.case=TRUE);
# mod_raw <- gsub(" WiFi\\+4G ", " WiFi \\+ 4G ", mod_raw, ignore.case=TRUE);
mod_raw <- gsub("\\b(W|w)(IFI|ifi)( |-)(ONLY|only)\\b", "\\1\\2\\4", mod_raw);
mod_raw <- gsub("\\bwill( +)not\\b", "wont", mod_raw);
mod_raw <- gsub("\\byr\\b", "year", mod_raw);
mod_raw <- gsub("\\bZa(a|g)g Invisible(.*)Shield\\b", "ZaagInvisibleShield", mod_raw);
### bid._sp
return(mod_raw) }
, args = c("description"))
# To identify glb_id_vars with >=10 obs
#mod_raw <- glb_allobs_df[sel_obs(list(descr.my.contains="\\bdoes( +)not\\b")), glbFeatsText]
#mod_raw <- glb_allobs_df[sel_obs(list(descr.my.contains="\\bipad [[:digit:]]\\b")), glbFeatsText][01:10]
#mod_raw <- glb_allobs_df[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][11:20]
#mod_raw <- glb_allobs_df[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][21:30]
#mod_raw <- glb_allobs_df[sel_obs(list(descr.my.contains="pad mini")), glbFeatsText][31:40]
# Derived feature: product line crossed with a "has description" flag.
#   productline : product line labels; every space is stripped out
#   description : listing description text; flag is 1 when nchar() > 0, else 0
# Returns a factor whose labels are "<productline-without-spaces>#<0|1>"
# (e.g. inputs "iPad 1" and "" yield the label "iPad1#0").
glbFeatsDerive[["prdl.descr.my.fctr"]] <- list(
    mapfn = function(productline, description) {
        prdl_compact <- gsub(" ", "", productline)
        has_descr <- as.numeric(nchar(description) > 0)
        as.factor(paste(prdl_compact, has_descr, sep = "#"))
    },
    args = c("productline", "description"))
#print(mycreate_sqlxtab_df(glb_allobs_df, c("prdl.descr.my.fctr", "sold")))
# mapfn=function(startprice) { return(scale(log(startprice))) }
# , args=c("startprice"))
# mapfn=function(Rasmussen) { return(ifelse(sign(Rasmussen) >= 0, 1, 0)) }
# mapfn=function(PropR) { return(as.factor(ifelse(PropR >= 0.5, "Y", "N"))) }
# mapfn=function(purpose) { return(relevel(as.factor(purpose), ref="all_other")) }
# mapfn=function(Week) { return(substr(Week, 1, 10)) }
# mapfn=function(raw) { tfr_raw <- as.character(cut(raw, 5));
# tfr_raw[is.na(tfr_raw)] <- "NA.my";
# return(as.factor(tfr_raw)) }
# , args=c("raw"))
# mapfn=function(PTS, oppPTS) { return(PTS - oppPTS) }
# , args=c("PTS", "oppPTS"))
# # If glb_allobs_df is not sorted in the desired manner
# mapfn=function(Week) { return(coredata(lag(zoo(orderBy(~Week, glb_allobs_df)$ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI) { return(coredata(lag(zoo(ILI), -2, na.pad=TRUE))) }
# mapfn=function(ILI.2.lag) { return(log(ILI.2.lag)) }
# glbFeatsDerive[["<txt_var>.niso8859.log"]] <- list(
# mapfn=function(<txt_var>) { match_lst <- gregexpr("&#[[:digit:]]{3};", <txt_var>)
# match_num_vctr <- unlist(lapply(match_lst,
# function(elem) length(elem)))
# return(log(1 + match_num_vctr)) }
# , args=c("<txt_var>"))
# mapfn=function(raw) { mod_raw <- raw;
# mod_raw <- gsub("&#[[:digit:]]{3};", " ", mod_raw);
# # Modifications for this exercise only
# mod_raw <- gsub("\\bgoodIn ", "good In", mod_raw);
# return(mod_raw)
# # Create user-specified pattern vectors
# #sum(mycount_pattern_occ("Metropolitan Diary:", glb_allobs_df$Abstract) > 0)
# if (txt_var %in% c("Snippet", "Abstract")) {
# txt_X_df[, paste0(txt_var_pfx, ".P.metropolitan.diary.colon")] <-
# as.integer(0 + mycount_pattern_occ("Metropolitan Diary:",
# glb_allobs_df[, txt_var]))
#summary(glb_allobs_df[ ,grep("P.on.this.day", names(glb_allobs_df), value=TRUE)])
# glb_allobs_df$<descriptor>.my <-
# plyr::revalue(glb_allobs_df$<descriptor>.my, c(
# "ABANDONED BUILDING" = "OTHER",
# "##" = "##"
# ))
# print(<descriptor>_freq_df <- mycreate_sqlxtab_df(glb_allobs_df, c("<descriptor>.my")))
# # print(dplyr::filter(<descriptor>_freq_df, grepl("(MEDICAL|DENTAL|OFFICE)", <descriptor>.my)))
# # print(dplyr::filter(dplyr::select(glb_allobs_df, -<var.zoo>),
# # grepl("STORE", <descriptor>.my)))
# glbFeatsDerive[["<var1>"]] <- glbFeatsDerive[["<var2>"]]
# Names of every derived feature registered in glbFeatsDerive so far.
glb_derive_vars <- names(glbFeatsDerive)
# Debugging aid (disabled): apply one derive mapfn by hand and inspect the result
# tst <- "descr.my"; args_lst <- NULL; for (arg in glbFeatsDerive[[tst]]$args) args_lst[[arg]] <- glb_allobs_df[, arg]; print(head(args_lst[[arg]])); print(head(drv_vals <- do.call(glbFeatsDerive[[tst]]$mapfn, args_lst)));
# print(which_ix <- which(args_lst[[arg]] == 0.75)); print(drv_vals[which_ix]);
# Date features: none are configured for this data set; formats and time zones
# are left as empty lists (the trailing comments show the expected entry shape).
glb_date_vars <- NULL # or c("<date_var>")
glb_date_fmts <- list(); #glb_date_fmts[["<date_var>"]] <- "%m/%e/%y"
glb_date_tzs <- list(); #glb_date_tzs[["<date_var>"]] <- "America/New_York"
#grep("America/New", OlsonNames(), value=TRUE)
# Price feature(s) -- presumably given price-specific treatment by the pipeline; confirm downstream
glbFeatsPrice <- c("startprice") #: bid._sp # NULL or c("<price_var>")
# Text Processing Step: custom modifications not present in txt_munge
# Text feature(s) to push through the text-processing steps configured below
glbFeatsText <- c("descr.my") # NULL #
# Switch every locale category to "C"; the next line is the captured console output
Sys.setlocale("LC_ALL", "C") # For english
## [1] "C/C/C/C/C/en_US.UTF-8"
# Text Processing Step: universal modifications
glb_txt_munge_filenames_pfx <- "ebay_mytxt_"
# Text Processing Step: tolower
# Text Processing Step: removePunctuation (use custom transformer to replace with space ???)
# Text Processing Step: removeWords
# Per-text-feature stop-word vectors; starts empty and is filled in below
glb_txt_stop_words <- list()
# Remember to use unstemmed words; Check stemming of "significant" words - any stopped words that should be stemmed with them ?
# Custom stop-word list for the "descr.my" text feature.
# Runs only when text features are configured (glbFeatsText is not NULL).
# The active words are grouped by the reason given in the comment heading each
# group (freq == 1, cor.y.train == NA, high chisq.pval, high nzv.freqRatio);
# the combined character vector is stored sorted.
# The leading c(NULL ...) presumably exists so that every entry line can start
# with a comma and be commented in/out independently.
# NOTE: none of the active lines call a tm function; only the commented-out
# stopwords()/removePunctuation() line under "bid._sp" would need tm.
if (!is.null(glbFeatsText)) {
require(tm)
glb_txt_stop_words[["descr.my"]] <- sort(c(NULL
### bid._sp
# , setdiff(removePunctuation(stopwords("english")), "no")
# ,"ac"
# # cor.y.train == NA
# ,unlist(strsplit(paste(c(NULL
# ,"128gb,1st,32gb,3g,64gb,90,acceptable,activation,amount,average,bad,buttons,buy,came,camera,can,care,carrier"
# #,casing
# ,"certified,charge,charging,cleaned,clear,come,components,contain,corner,correctly,covered,customer,earbuds"
# ,"engraved,engraving,engravement" # somehow didn't show up in the cor.y.train == NA list
# ,"entire,except,fair,features,feel,fine,generation,get,gift,got,heavily,heavy,however,imei,include,inspected,invisible,invisibleshield"
# ,"ipad,ipads"
# ,"issues"
# #,items,
# ,"keyboard,lightning,listing,little,looks,lower"
# ,"manufacture,manufacturer"# somehow didn't show up in the cor.y.train == NA list
# ,"meaning,model,near,need,needs,nicks,opened,operational,otherwise"
# ,"person,personal"# somehow didn't show up in the cor.y.train == NA list
# ,"phone,photos,pics,plastic,port,professionally"
# ,"purchased,purchasing"# somehow didn't show up in the cor.y.train == NA list
# ,"quality,questions,read,ready"
# ,"receive,received"# somehow didn't show up in the cor.y.train == NA list
# ,"removed,replaced,retail,return,returns,runs"
# #,scratch,
# ,"scuffing,sealed,sell,seller,selling,shape,ship,shown,silver,since,sold,sound,spacegray,stock,sync,tablet,taken,technician,tests,third,time,touch,units,unlocked,week,wifi,without"
# ,"wrap" # somehow didn't show up in the cor.y.train == NA list
# ,"zagg"
# ), collapse=",")
# , "[,]")) #err.abs.fit.sum=26.869473 w/o items,scratch
#
# # cor.y.abs is low
# #,"always","comes","grade","moderate","protector"
### bid._sp
### !_sp
# freq == 1; keep "gold"
,"09","17","28","34","360","5c","5point1point1","511","6428"
,"7point9","7point9in","79in"
,"8point5","8point25","82510"
,"910","9510","9point7","9point75","97510","99"
,"a1314","a1430","abused","accept","accounts","across","actuuly","add","advised"
,"affects","am","ans","antenna","anti","anyone","anything","applied","applying"
,"area","arizona","att","attached"
,"backlight","backlit","backplate","barley"
,"beetle","beginning","bend","besides","between"
,"bidder","binder","bonus","book","boot","bound","brick","broke","bruises"
,"buyers"
,"capacity","causing","cherished","chrome","classes","closely"
,"condtion","conditon","confidence","considerable","consist","consistent"
,"consumer","contents","contidion","control"
,"couldnt"
,"cream","customize","cuts"
,"daily","date","daughter"
,"deactivated","decent","deep","defender","defense","degree","demonstration"
,"depicted","depress","desciption"
,"difficulty","digi","disclaimer","discoloration","distressed","divider"
,"dlxnqat9g5wt","dock","documents","done","durable","dust","duty"
,"each","effect","either","emblem","erased","esi","essentially","etch","etched"
,"euc","every","exact","exhibition","expires"
,"facing","faded","faint","february","film","final","five","flickers"
,"folding","forgot","forwarders"
,"freezes","freight"
,"functioanlity"
,"games","generic","genuine","glitter","goes","gray","grey","guide"
,"hairline","half","handstand","hdmi","here","high","higher"
,"hold","hole","hospital"
,"immaculate","impact"
,"inivisible","instead"
,"intended","interest","interior","international","internationally"
,"into","intro"
,"ios","isnt","itself","ive"
,"jack","july"
,"keyword","kids","kind","knicks"
,"l","largest","last","late","length","let","letters","level"
,"lifting","limited","literally","literature"
,"local","logic","long","longer","looping","loose","loss","lost"
,"mars"
,"mb292ll","mc707ll","mc916ll","mc991ll","md789ll","mf432ll"
,"mic","middle","mind","minimum","mixed"
,"myself"
,"neither","newer","non","none","nonviewing","nor","november"
,"occasional","oem","often","online","oped","outside","over"
,"padfolio","pairing","paperwork","past","period","pet","piece"
,"plate","played","plug"
,"poor","portfolio","portion","pouch"
,"preinstalled","pressure","price","proof","provided"
,"ranging","rather"
,"real","realized","reassemble","reboot","receipt","recently","red"
,"reflected","refunds","relisting","remote","repeat","reponding"
,"required"
,"reserve","reshaped","residue","respond","restarts","result"
,"reviewed"
,"ringer","roughly","rubber"
,"said","same","school","scratchs","screeb"
,"seamlessly","seem","seen","semi","send","september","serious"
,"shell","shipment","short","showroom"
,"sighs","site","size"
,"sleeve","slice"
,"smoke","smooth","smudge","sn"
,"softer","software","somewhat","soon"
,"space","sparingly","sparkiling","spec","special","speck","speed","speigen"
,"stains","standup","start","status","stopped","strictly"
,"subtle","sustained"
,"swappadotcom","swiped","swivel"
,"take","technical","tempered","texture"
,"thank","then","therefore","think","those","though"
,"toddler","topback","totally","touchy","toys"
,"tried","turn","typical"
,"u","university","unknown","untouched","uppser"
,"valid","vary","viewing","virtually"
,"want","wavy","website","whole","why","winning","worn"
,"zaag","zero","zombie","zoogue"
# cor.y.train == NA
,"account","applecare","download","expect","fourth","greeting","maybe"
,"plus","purposes","significant","title","volume"
# chisq.pval high (e.g. == 1);
# keep
# carrier.fctr: "sprint", "verizon"
# cellular.fctr: "3g", "4g",wifion
# color.fctr: "gold"
# prdl.descr.my.fctr:
# storage.fctr: "128gb"
,"2016"
,"acceptable","actual","amount","awesome"
,"beautiful","before","bent","best","blocked","blocks"
,"capable","converted"
,"find"
,"gift"
,"handled","handling","headphone"
,"im","information"
,"love"
,"march","meaning","means","medium","money"
,"necessary"
,"offer","once"
,"page","product","products"
,"second","seconds","should","silver","skin","skinned"
,"tape","thoroughly","twice"
,"user"
,"way","which"
# nzv.freqRatio high (e.g. >= glb_nzv_freqCut);
# keep
# carrier.fctr: "sprint", "tmobile", "verizon"
# cellular.fctr: "3g", "4g",wifionly
# color.fctr: "gold",spacegray
# condition.fctr:
# levels:
# "Used", "For parts or not working", "Manufacturer refurbished", "New", "New other (see details)", "Seller refurbished"
# stemmed tokens:
# manufactur
# prdl.descr.my.fctr: "ipad1",ipad3,ipad4,ipadair2,ipadmini2
# storage.fctr: "128gb"
,"14","2015","3","30","4","5","6","7","8","9","90","9point5","9point7in"
,"a1432","able","about","ac","activate","activated","activation"
,"additional","adult"
,"after"
,"ago"
,"air"
,"along","already","also"
,"another","answer"
,"appears","approved","april"
,"around"
,"asis","associated"
,"auction"
,"backside","bad","battery","because","bestbuy","bezel","blue","bluetooth"
,"board","body","both","bottom","bought"
,"bright"
,"bumps","buy","buying"
,"came","camera","cameras","cannot","cant","card","carrier","cellular"
,"changed","changing","check","chip","chips"
,"color","colors","company","complete","completely","components"
,"connect","connected","connector","connects","contain","contains"
,"corporate","correctly","couple"
,"customer"
,"data"
,"dead"
,"default","defaulting","definite","definitely"
,"delivered","demo","describe","described","details"
,"do","does","dont","down"
,"drop","dropped","drops"
,"due"
,"earbuds","easily","ebay","else","engraved","engravement","engraving","entire"
,"etc"
,"even","ever","everything"
,"except","exterior","extremely"
,"fantastic","fast","faulty","features","feel","feels","fine","fix","fixed"
,"flaw","flaws"
,"frame","framing"
,"geeksquad","general","get","got","guarantee","guaranteed","guarantees"
,"hand","handful","handset","happened","hard","hardly","heavy","home","however"
,"id","if","images","imperfections"
,"inside","inspected","install","installed","instructions","invisible"
,"iphone","iphones"
,"issue","issues"
,"itunes"
,"keyboard","know","known"
,"latest","lcd","least","leather","less"
,"life","lightening","lightning","like","line","lining"
,"liquid","liquidation"
,"logo","lot","lots","lower"
,"magnetic","make","manual","manuals","many","me","memory","missing"
,"model","models","moderate","more","most","mostly"
,"multiple","musthave"
,"my"
,"name","network","networks","nice","nick","nicks"
,"nonfunctioning","noticeable","now"
,"off","old","operated","operational","otherwise","outer","own","owned"
,"party","passcode","password"
,"performance","performs","person","personal","personalized"
,"phone","physical","physically"
,"pin","pixels"
,"placed","places"
,"port","possible"
,"pretty","pristine","problem","problems","properly"
,"purchase","purchased","purchasing"
,"quality","question","questions"
,"rarely"
,"reactivated","really","rear","receive","received","regular","removed"
,"repair","retail","retina"
,"rotate","rotation"
,"running","runs"
,"s","sale","sales","sameday","scrapes","scroll","setup","several"
,"shattered","shrinkwrap","shut","shuts"
,"side","sides","sim","single"
,"so","something","sometimes","sound"
,"speaker","specifics","spent"
,"sticker","stickers","storage","store","storing","stuck","stylus"
,"super","supply","sure","surface"
,"sync"
,"taken","technician","than","these","they","thin","third","three"
,"till","tiny"
,"too","took","touch","touching","touchscreen"
,"turns"
,"two"
,"unable","under","unnoticeable","unopened","unsealed","until"
,"unused","unusable"
,"up","update","updated","updates","upgrade","upgraded","upper"
,"usage","usually"
,"verified","verify"
,"wall","water","we","week","weeks", "were","what","whats","where","while"
,"wiped","without"
,"would"
,"wrap","wrapped","wrong"
,"x"
,"year","years","your"
,"zagg","zaaginvisibleshield"
#
### !_sp
))
}
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^2", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
#glb_allobs_df[glb_post_stem_words_terms_mtrx_lst[[txt_var]][, 6] > 0, glbFeatsText]
# To identify terms with a specific freq
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], freq == 1)$term), collapse = ",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], freq <= 2)$term), collapse = ",")
# To identify terms with a specific freq, excluding terms that
# are stemmed together with another term later or that are a value of color.fctr (e.g. gold)
#paste0(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], (freq == 1) & !(term %in% c("blacked","blemish","blocked","blocks","buying","cables","careful","carefully","changed","changing","chargers","cleanly","cleared","connect","connects","connected","contains","cosmetics","default","defaulting","defective","definitely","describe","described","devices","displays","drop","drops","engravement","excellant","excellently","feels","fix","flawlessly","frame","framing","gentle","gold","guarantee","guarantees","handled","handling","having","install","iphone","iphones","keeped","keeps","known","lights","line","lining","liquid","liquidation","looking","lots","manuals","manufacture","minis","most","mostly","network","networks","noted","opening","operated","performance","performs","person","personalized","photograph","physically","placed","places","powering","pre","previously","products","protection","purchasing","returned","rotate","rotation","running","sales","second","seconds","shipped","shuts","sides","skin","skinned","sticker","storing","thats","theres","touching","unusable","update","updates","upgrade","weeks","wrapped","verified","verify") ))$term), collapse = ",")
#print(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (freq <= 2)))
#glb_allobs_df[which(terms_mtrx[, 229] > 0), glbFeatsText]
# To identify terms with cor.y == NA
#orderBy(~-freq+term, subset(glb_post_stop_words_terms_df_lst[[txt_var]], is.na(cor.y)))
#paste(sort(subset(glb_post_stop_words_terms_df_lst[[txt_var]], is.na(cor.y))[, "term"]), collapse=",")
#orderBy(~-freq+term, subset(glb_post_stem_words_terms_df_lst[[txt_var]], is.na(cor.y)))
# To identify terms with low cor.y.abs
#head(orderBy(~cor.y.abs+freq+term, subset(glb_post_stem_words_terms_df_lst[[txt_var]], !is.na(cor.y))), 5)
# To identify terms with high chisq.pval
#subset(glb_post_stem_words_terms_df_lst[[txt_var]], chisq.pval > 0.99)
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (chisq.pval > 0.99) & (freq <= 10))$term), collapse=",")
#paste0(sort(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (chisq.pval > 0.9))$term), collapse=",")
#head(orderBy(~-chisq.pval+freq+term, glb_post_stem_words_terms_df_lst[[txt_var]]), 5)
#glb_allobs_df[glb_post_stem_words_terms_mtrx_lst[[txt_var]][, 68] > 0, glbFeatsText]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^m", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
# To identify terms with high nzv.freqRatio
#summary(glb_post_stem_words_terms_df_lst[[txt_var]]$nzv.freqRatio)
#paste0(sort(setdiff(subset(glb_post_stem_words_terms_df_lst[[txt_var]], (nzv.freqRatio >= glb_nzv_freqCut) & (freq < 10) & (chisq.pval >= 0.05))$term, c( "128gb","3g","4g","gold","ipad1","ipad3","ipad4","ipadair2","ipadmini2","manufactur","spacegray","sprint","tmobil","verizon","wifion"))), collapse=",")
# To identify obs with a txt term
#tail(orderBy(~-freq+term, glb_post_stop_words_terms_df_lst[[txt_var]]), 20)
#mydsp_obs(list(descr.my.contains="non"), cols=c("color", "carrier", "cellular", "storage"))
#grep("ever", dimnames(terms_stop_mtrx)$Terms)
#which(terms_stop_mtrx[, grep("ipad", dimnames(terms_stop_mtrx)$Terms)] > 0)
#glb_allobs_df[which(terms_stop_mtrx[, grep("16", dimnames(terms_stop_mtrx)$Terms)[1]] > 0), c(glb_category_var, "storage", txt_var)]
# To identify whether terms should be synonyms
#orderBy(~term, glb_post_stop_words_terms_df_lst[[txt_var]][grep("^moder", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ])
# term_row_df <- glb_post_stop_words_terms_df_lst[[txt_var]][grep("^came$", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ]
#
# cor(glb_post_stop_words_terms_mtrx_lst[[txt_var]][glb_allobs_df$.lcn == "Fit", term_row_df$pos], glb_trnobs_df[, glb_rsp_var], use="pairwise.complete.obs")
# To identify which stopped words are "close" to a txt term
#sort(cluster_vars)
# Text Processing Step: stemDocument
# To identify stemmed txt terms
#glb_post_stop_words_terms_df_lst[[txt_var]][grep("condit", glb_post_stop_words_terms_df_lst[[txt_var]]$term), ]
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^con", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
#glb_allobs_df[which(terms_stem_mtrx[, grep("use", dimnames(terms_stem_mtrx)$Terms)[[1]]] > 0), c(glb_id_var, "productline", txt_var)]
#glb_allobs_df[which(TfIdf_stem_mtrx[, 191] > 0), c(glb_id_var, glb_category_var, txt_var)]
#which(glb_allobs_df$UniqueID %in% c(11915, 11926, 12198))
# Text Processing Step: mycombineSynonyms
# To identify which terms are associated with not -> combine "could not" & "couldn't"
#findAssocs(glb_full_DTM_lst[[txt_var]], "not", 0.05)
# To identify which synonyms should be combined
#orderBy(~term, glb_post_stem_words_terms_df_lst[[txt_var]][grep("^c", glb_post_stem_words_terms_df_lst[[txt_var]]$term), ])
# Interactive check for a candidate synonym group.
#   syn_lst : list with elements `word` (a single term) and `syns` (character
#             vector of terms), in the same shape as the entries of glb_txt_synonyms.
# Prints (1) the rows of the post-stemming terms table for txt_var whose term is
# in syn_lst$syns, then (2) the row(s) for syn_lst$word in the terms of the
# corpus after mycombineSynonyms has been applied with this single group.
# Reads the globals txt_var, glb_post_stem_words_terms_df_lst and
# glb_txt_corpus_lst; invisibly returns the second printed subset.
chk_comb_cor <- function(syn_lst) {
    # cor(terms_stem_mtrx[glb_allobs_df$.src == "Train", grep("^(damag|dent|ding)$", dimnames(terms_stem_mtrx)[[2]])], glb_trnobs_df[, glb_rsp_var], use="pairwise.complete.obs")

    # Before combining: one row per candidate synonym
    stem_terms_df <- glb_post_stem_words_terms_df_lst[[txt_var]]
    print(subset(stem_terms_df, term %in% syn_lst$syns))

    # After combining: the row for the replacement word
    merged_corpus <- tm_map(glb_txt_corpus_lst[[txt_var]], mycombineSynonyms,
                            list(syn_lst), lazy=FALSE)
    print(subset(get_corpus_terms(merged_corpus), term == syn_lst$word))

    # cor(terms_stop_mtrx[glb_allobs_df$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])], glb_trnobs_df[, glb_rsp_var], use="pairwise.complete.obs")
    # cor(rowSums(terms_stop_mtrx[glb_allobs_df$.src == "Train", grep("^(damage|dent|ding)$", dimnames(terms_stop_mtrx)[[2]])]), glb_trnobs_df[, glb_rsp_var], use="pairwise.complete.obs")
}
#chk_comb_cor(syn_lst=list(word="cabl", syns=c("cabl", "cord")))
#chk_comb_cor(syn_lst=list(word="damag", syns=c("damag", "dent", "ding")))
#chk_comb_cor(syn_lst=list(word="dent", syns=c("dent", "ding")))
#chk_comb_cor(syn_lst=list(word="use", syns=c("use", "usag")))
# Synonym groups per text feature.
# Each active entry is list(word = <term>, syns = <character vector of terms>);
# presumably every term in `syns` is replaced by `word` by mycombineSynonyms
# (defined outside this section) -- confirm against that helper.
# The entries look like stemmed terms (e.g. "cabl", "generat", "passcod").
# NOTE(review): unlike c(NULL, ...), list(NULL, ...) keeps the NULL, so the
# first element of glb_txt_synonyms[["descr.my"]] is NULL -- confirm that the
# consumer tolerates (or expects) it.
glb_txt_synonyms <- list()
glb_txt_synonyms[["descr.my"]] <- #NULL : default
list(NULL
### bid._sp
# , list(word="cabl", syns=c("cabl", "cord"))#err.abs.fit.sum=26.863220
# # , list(word="charger", syns=c("charg", "charger"))
# # , list(word="come", syns=c("came", "come"))
# # , list(word="dent", syns=c("dent", "ding"))
# # , list(word="damag", syns=c(#"bad", "blemish", "broken", "crack",
# # #defect has +ve cor, others have -ve cor
# # "damag", "dent", "ding",
# # #"scratch", "scuff", "tear", "wear",
# # NULL))
# # # combining damag with defect & dent results in higher err.abs.fit.sum=26.885899
# # # combining defect with dent in higher err.abs.fit.sum=26.894976
# # , list(word="defect", syns=c(#"bad", "blemish", "broken", "crack",
# # "defect", "dent", #"ding", ding has -ve cor, others have +ve cor
# # #"scratch", "scuff", "tear", "wear",
# # NULL))
# #, list(word="new", syns=c("brand")) ???
# # , list(word="scuff", syns=c("scuf", "scuff"))
# # , list(word="show", syns=c("show", "shown"))
# # , list(word="tablet", syns=c("tab", "tablet"))
### bid._sp
### !_sp
,list(word = "cabl", syns = c("cabl", "cord"))
,list(word = "cant", syns = c("cant", "cannot"))
,list(word = "descript", syns = c("descript", "discript"))
,list(word = "generat", syns = c("gen", "generat"))
,list(word = "ipadmini", syns = c("ipadmini", "mini"))
,list(word = "kept", syns = c("keep", "kept"))
,list(word = "know", syns = c("know", "known"))
,list(word = "lightn", syns = c("lighten", "lightn"))
,list(word = "passcod", syns = c("passcod", "password"))
,list(word = "photo", syns = c("photo", "photograph", "photos", "pic", "pictur"))
,list(word = "preown", syns = c("pre", "preown", "previous",
"previouslyus", "previouslyown"))
,list(word = "protector", syns = c("protect", "protector"))
,list(word = "scuff", syns = c("scuf", "scuff"))
,list(word = "tablet", syns = c("tab", "tablet"))
,list(word = "with", syns = c("w", "with"))
,list(word = "zagg", syns = c("zagg", "zaaginvisibleshield"))
### !_sp
)
# Re-label the per-feature entries with the configured text feature names
# (a no-op while glbFeatsText is exactly "descr.my")
if (length(glb_txt_synonyms) > 0) names(glb_txt_synonyms) <- glbFeatsText
# Text Processing Step: filterTerms
# Term-matrix control settings; built only when text features are configured.
if (!is.null(glbFeatsText)) {
    # weightTfIdf below comes from tm. library() stops right here with
    # "there is no package called 'tm'" when the package is missing, whereas
    # require() would only warn and let the failure surface later as an
    # unrelated "object 'weightTfIdf' not found" error.
    library(tm)
    # options include: weightTf, myweightTflog1p, myweightTfsqrt, weightTfIdf, weightBM25
    glb_txt_terms_control <- list(weighting = weightTfIdf # : default
        # termFreq selection criteria across obs: default: list(global=c(1, Inf))
        , bounds = list(global = c(1, Inf)) # bid._sp: list(global=c(3, Inf))
        # default: c(3, Inf)
        , wordLengths = c(1, Inf) # bid._sp: c(2, Inf)
    )
}
# Variable used as the correlation target for text terms (here: the response)
glb_txt_cor_var <- glb_rsp_var # bid._sp: "startprice.log10.cut.fctr" # default: glb_rsp_var
# select one from c("union.top.val.cor", "top.cor", default: "top.val", "top.chisq", "sparse")
glbFeatsTextFilter <- "top.chisq"
# Maximum number of terms, one value per text feature (named by glbFeatsText)
# bid._sp: c(20) # c(50) in (old) !_sp # default: rep(10, length(glbFeatsText))
glbFeatsTextTermsMax <- setNames(c(15), glbFeatsText)
# Text Processing Step: extractAssoc
# Association correlation cut-off, one value per text feature (named by glbFeatsText)
#bid._sp: c(0.4) #(old) !_sp: 0.2 #default: rep(1, length(glbFeatsText))
glbFeatsTextAssocCor <- setNames(c(0.45), glbFeatsText)
# Text Processing Step: extractPatterns (ngrams)
# Potential Enhancements
# "Seller refurbished" -> D.P.refurbished.seller ?
# "Like new" -> D.P.new.like ?
# "No scratches" -> D.P.scratch.no ?
# No manually flagged important terms for this run
glb_important_terms <- list()
# Remember to use stemmed terms
# Have to set it even if it is not used
# Sparsity threshold, one value per text feature (named by glbFeatsText)
glb_sprs_thresholds <- setNames(c(0.950), glbFeatsText) # Generates 8 terms
# Properties:
# numrows(glb_feats_df) << numrows(glb_fitobs_df)
# Select terms that appear in at least 0.2 * O(FP/FN(glb_OOBobs_df))
# numrows(glb_OOBobs_df) = 1.1 * numrows(glb_newobs_df)

# When the modelled response differs from the raw response variable,
# add the raw one to the excluded features
if (glb_rsp_var_raw != glb_rsp_var) {
    glbFeatsExclude <- union(glbFeatsExclude, glb_rsp_var_raw)
}
# Miscellaneous pipeline settings. All are plain constants read by code outside
# this section; the trailing comments record defaults and the values used in the
# alternative "bid._sp" / "!_sp" configurations.
glbFctrMaxUniqVals <- 23 # default: 20
# Missing-data imputation is switched off; the seed below is presumably only
# used when it is on -- confirm
glb_impute_na_data <- FALSE # or TRUE
glb_mice_complete.seed <- 144 # or any integer
# Clustering is switched on for this run
glb_cluster <- TRUE # bid._sp:TRUE # default:FALSE
glb_cluster.seed <- 189 # or any integer
### !_sp
glb_cluster_entropy_var <- glb_rsp_var # c(glb_rsp_var, as.factor(cut(glb_rsp_var, 3)), default: NULL)
glbFeatsTextClusterVarsExclude <- FALSE # default FALSE
### bid._sp
# glb_cluster_entropy_var <- "sold" #"startprice.log10.cut.fctr"
# glbFeatsTextClusterVarsExclude <- TRUE # default FALSE
### bid._sp
# Maps carrier.fctr to cellular.fctr; going by the variable name, carrier.fctr
# is presumably used only in interaction with cellular.fctr -- confirm
glb_interaction_only_feats_lst <- list()
### bid._sp
glb_interaction_only_feats_lst[["carrier.fctr"]] <- "cellular.fctr"
# Near-zero-variance cut-offs
glb_nzv_freqCut <- 19 # 19 is caret default
glb_nzv_uniqueCut <- 10 # 4 : bid._sp # 10 : caret::default
### bid._sp
# The right-hand side of this assignment is the c(47, ..., 54) vector on the last
# line below; every line in between is a comment holding earlier alternatives.
glb_rfe_fit_sizes <- # NULL
### bid0_sp
# c(106, 111, 116, 120, 128) # or NULL c(8, 16, 32, 64, 128, 140)
### bid1_sp
# c(8, 11, 16, 21, 32, 64, 128)
c(47,48,49,50,51,52,53,54)
# outliers identified by car::outlierTest
# NOTE: this is ONE assignment. Everything between the `<-` and the final c(...) line is
# commented out, so the value actually assigned is the vector on the last line of this
# block. The commented c(NULL ... ) span is a log of ids tried in earlier runs, each with
# the diagnostics recorded at the time; "(up)" marks entries whose recorded
# err.abs.fit.sum is higher than the value recorded just before them.
glb_obsfit_outliers <- #NULL
### bid._sp
# c(NULL # default: NULL
# biddable == 0 & 1; err.abs.fit.sum=423.55172
# # outliers
# , 10813 # next 665 w/ rstudent=-5.091080; biddable=3.263257; err.abs.fit.sum=418.598755
# , 10666 # next 1727 w/ rstudent=-5.163517; biddable=4.293465; err.abs.fit.sum=414.093609
# , 11736 # next 780 w/ rstudent=-5.181343; biddable=5.670483; err.abs.fit.sum=401.817992
# # old biddable importance above this
# , 10781 # next 1323 w/ rstudent=-5.151062; biddable=13.30602; err.abs.fit.sum=396.393721
# #, 10091 # next 91 w/ rstudent=-4.444452; biddable=; err.abs.fit.sum=402.673715 (up)
# #, 10166 # next 560 w/ rstudent=-5.006795; biddable=; err.abs.fit.sum=401.759324 (up)
# #, 10281 # next 281 w/ rstudent=-4.245087; biddable=; err.abs.fit.sum=401.316926 (up)
# #, 10285 # next 285 w/ rstudent=-4.483190; biddable=; err.abs.fit.sum=402.608936 (up)
# #, 10446 # next 445 w/ rstudent=-4.663418; biddable=; err.abs.fit.sum=403.074523 (up)
# #, 10542 # next 1323 w/ rstudent=-5.214517; biddable=; err.abs.fit.sum=401.04205 (up)
# #, 10543 # next 1323 w/ rstudent=-5.214517; biddable=; err.abs.fit.sum=401.04205 (up)
# #, 10561 # next 542 w/ rstudent=-4.736154; biddable=; err.abs.fit.sum=401.56198 (up)
# #, 10631 # next 166 w/ rstudent=-5.073048; biddable=; err.abs.fit.sum=401.556788 (up)
# #, 11330 # next 630 w/ rstudent=-5.117659; biddable=; err.abs.fit.sum=401.732597 (up)
# , 10091, 10166, 10281, 10285, 10446, 10542, 10543, 10561, 10631, 11330
# # biddable=18.93923; err.abs.fit.sum=359.388769
# , 10330 #biddable=19.06084; err.abs.fit.sum=355.895702
# , 10402 #biddable= 0.0 ; err.abs.fit.sum=351.315181
# , 10438 #biddable= 0.0 ; err.abs.fit.sum=347.821527
# , 10624 #biddable= 0.0 ; err.abs.fit.sum=343.724904
# , 10659 #biddable= 0.0 ; err.abs.fit.sum=331.873603
# , 11323 #biddable=10.45901; err.abs.fit.sum=324.929562
# , 11422 #biddable= 0.0 ; err.abs.fit.sum=334.839805 (up)
# biddable == 0; err.abs.fit.sum=26.713317
# , 11448 # outliers; next is 858 w/ rstudent=-5.855132; err.abs.fit.sum=24.212800
# , 11583 # outliers; next is 856 w/ rstudent=-4.792849; err.abs.fit.sum=22.164035
# , 11581 # outliers; next is 743 w/ rstudent=-4.005054; err.abs.fit.sum=18.842901
# , 10837 # outliers; next is 336 w/ rstudent=-5.279215; err.abs.fit.sum=18.124560
# , 11442 # outliers; next is 904 w/ rstudent=-4.474844; err.abs.fit.sum=15.533211
# , 11697 # outliers; next is 874 w/ rstudent=-3.678664; err.abs.fit.sum=13.829375
# , 10799 # .hatvalues == 1; total 8; iPadmini#1; err.abs.fit.sum=13.807283
# , 10017 # .hatvalues == 1; total 7; iPad3#1; err.abs.fit.sum=14.620782 (up)
# , 10027, 10859 # .hatvalues == 1; total 7; iPad1#1; err.abs.fit.sum=14.570246 (up)
# , 10332 # .hatvalues == 1; total 7; iPad4#1; err.abs.fit.sum=13.706467
# , 11759 # .hatvalues == 1; total 6; iPadAir2#1; err.abs.fit.sum=13.643043
# , 10675 # .hatvalues == 1; total 5; iPadAir#1; err.abs.fit.sum=13.623787
# , 11119 # .hatvalues == 1; total 4; iPadmini3#1; err.abs.fit.sum=NA
# , 10017, 10027, 10859 # .hatvalues == 1; total 1; iPad3#1 & iPad1#1; err.abs.fit.sum=13.438903
# biddable == 1; err.abs.fit.sum=361.78243
# , 10813 # outliers; next is 665 w/ rstudent=-5.021180; err.abs.fit.sum=356.83424
# , 10666 # outliers; next is 808 w/ rstudent=-4.764126; err.abs.fit.sum=352.46437
# , 11736 # outliers; next is 665 w/ rstudent=-4.614022; err.abs.fit.sum=348.59977
# , 10542 # outliers; next is 665 w/ rstudent=-4.654923; err.abs.fit.sum=344.18546
# , 11330 # outliers; next is 327 w/ rstudent=-4.628972; err.abs.fit.sum=336.12636
# , 10561 # outliers; next is 56 w/ rstudent=-4.612970; err.abs.fit.sum=329.50309
# , 10166 # outliers; next is 318 w/ rstudent=-4.717238; err.abs.fit.sum=318.50562
# , 10543 # outliers; next is 464 w/ rstudent=-4.811116; err.abs.fit.sum=314.32801
# , 10285 # outliers; next is 21 w/ rstudent=-4.850822; err.abs.fit.sum=310.19008
# #, 10091 # outliers; next is 464 w/ rstudent=-4.941448; err.abs.fit.sum=312.94069 (up)
# #, 10781 # outliers; next is 250 w/ rstudent=-4.793502; err.abs.fit.sum=313.03867 (up)
# , 10446 # outliers; next is 371 w/ rstudent=-4.787578; err.abs.fit.sum=307.15681
# , 10631 # outliers; next is 165 w/ rstudent=-4.130356; err.abs.fit.sum=303.34549
# #, 10330 # outliers; next is 217 w/ rstudent=-4.067684; err.abs.fit.sum=312.75121 (up)
# #, 10402 # outliers; next is 388 w/ rstudent=-4.067684; err.abs.fit.sum=311.84516 (up)
# #, 10659 # outliers; next is 128 w/ rstudent=-3.982911; err.abs.fit.sum=311.84516 (up)
# , 10091, 10781, 10330, 10402, 10659#, 10281 outliers; err.abs.fit.sum=282.381827; iPad4#0=13.806011; iPad4#1=7.799398
# #, 10281 # outliers; next is NA w/ rstudent=NA; err.abs.fit.sum=287.147331 (up); iPad4#0=14.372770; iPad4#1=4.591408
# #, 10624 # outliers; ignored along with 10281 err.abs.fit.sum=289.116467 (up); iPad4#0=; iPad4#1=
# #, 10624 # outliers; ignored w/o 10281 err.abs.fit.sum=286.415040 (up); iPad4#0=; iPad4#1=
# #, 10636 # hatvalues==1; next is 11652; err.abs.fit.sum=290.50254 (up)
# , 11652 # hatvalues==1; next is 10636; err.abs.fit.sum=282.183867
# #err.abs.fit.sum=282.227249
# )
### bid._sp
# .hatvalues == 1
# Active value of glb_obsfit_outliers (ids recorded above as having .hatvalues == 1):
c(10096, 11062, 11233, 11660, 11768)
# influence.measures: car::outlier; rstudent; dffits; hatvalues; dfbeta; dfbetas
#print(outliers <- car::outlierTest(glb_models_lst[["RFE.X.glm"]]$finalModel))
#print(outliers_df <- data.frame(.Bonf.p=outliers$bonf.p))
#obs_df <- glb_trnobs_df; mdlId <- "RFE.X.Train.glm"
#model_diags_df <- cbind(obs_df, data.frame(.rstudent=stats::rstudent(glb_models_lst[[mdlId]]$finalModel)), data.frame(.dffits=stats::dffits(glb_models_lst[[mdlId]]$finalModel)), data.frame(.hatvalues=stats::hatvalues(glb_models_lst[[mdlId]]$finalModel)))
#print(summary(model_diags_df[, c(".rstudent",".dffits",".hatvalues")]))
#print(subset(model_diags_df, is.na(.dffits)))
#print(subset(model_diags_df, .hatvalues == 1))
#dffits_df <- merge(dffits_df, outliers_df, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#dffits_df <- merge(dffits_df, glb_fitobs_df, by="row.names", all.x=TRUE); row.names(dffits_df) <- dffits_df$Row.names; dffits_df <- subset(dffits_df, select=-Row.names)
#subset(dffits_df, !is.na(.Bonf.p))
#indep_vars <- myextract_actual_feats(row.names(orderBy(reformulate(c("-", paste0(glb_sel_mdl_id, ".importance"))), glb_featsimp_df))); indep_vars <- indep_vars[!grepl(".fctr", indep_vars, fixed=TRUE)]
#myplot_parcoord(obs_df=model_diags_df[, c(glb_id_var, glb_category_var, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:20])], obs_ix=row.names(model_diags_df) %in% names(outliers$rstudent)[1], id_var=glb_id_var, category_var=glb_category_var)
#model_diags_df[row.names(model_diags_df) %in% names(outliers$rstudent)[c(1:2)], ]
#ctgry_diags_df <- model_diags_df[model_diags_df[, glb_category_var] %in% c("Unknown#0"), ]
#myplot_parcoord(obs_df=ctgry_diags_df[, c(glb_id_var, glb_category_var, ".rstudent", ".dffits", ".hatvalues", glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:20])], obs_ix=row.names(ctgry_diags_df) %in% names(outliers$rstudent)[1], id_var=glb_id_var, category_var=glb_category_var)
#table(glb_fitobs_df[model_diags_df[, glb_category_var] %in% c("iPad1#1"), "startprice.log10.cut.fctr"])
#glb_fitobs_df[model_diags_df[, glb_category_var] %in% c("iPad1#1"), c(glb_id_var, "startprice")]
# No outliers & .dffits == NaN
#myplot_parcoord(obs_df=model_diags_df[, c(glb_id_var, glb_category_var, glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:10])], obs_ix=seq(1:nrow(model_diags_df))[is.na(model_diags_df$.dffits)], id_var=glb_id_var, category_var=glb_category_var)
#dffits_ctgry_df <- subset(dffits_df, prdl.descr.my.fctr %in% c("Unknown#0"))
#myplot_parcoord(obs_df=dffits_ctgry_df[, c(glb_id_var, glb_category_var, ".dffits", ".Bonf.p", glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet", indep_vars[1:5])], obs_ix=seq(1:nrow(dffits_ctgry_df))[!is.na(dffits_ctgry_df$.Bonf.p)], id_var=glb_id_var, category_var=glb_category_var)
#
#car::influenceIndexPlot(glb_models_lst[["RFE.X.glm"]]$finalModel, id.n=3)
# myplot_parcoord(obs_df=glb_fitobs_df[, c(glb_id_var, glb_rsp_var,
# "startprice.log10.predict.RFE.X.glmnet",
# indep_vars[1:5])], obs_ix=hatobs_ix, id_var=glb_id_var)
# myplot_parcoord(x=glb_fitobs_df[, c(glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet",
# indep_vars[1:2])], obs_ix=hatobs_ix)
# hatvals <- hatvalues(glb_models_lst[["RFE.X.glm"]]$finalModel)
# hatobs_ix <- which(hatvals == max(hatvals))
# MASS::parcoord(x=glb_fitobs_df[, c(glb_rsp_var, "startprice.log10.predict.RFE.X.glmnet",
# indep_vars[1:2])], var.label=TRUE)
#plot(hatvalues(glb_models_lst[["RFE.X.glm"]]$finalModel), type = "h")
#glb_fitobs_df[which(row.names(glb_fitobs_df) %in% c("972")), c(glb_id_var, glb_rsp_var, glb_rsp_var_raw, "sold", glb_category_var)]
#all.equal(glb_models_lst[[glb_sel_mdl_id]], glb_models_lst[[glb_fin_mdl_id]])
# Outliers for the training observations: currently exactly the fit-set outliers.
# Append further ids inside c(...) if the training set needs additional exclusions.
glb_obstrn_outliers <- c(glb_obsfit_outliers)
#car::outlierTest(glb_models_lst[["RFE.X.glm"]]$finalModel)
#glb_trnobs_df[which(row.names(glb_fitobs_df) %in% c("972")), c(glb_id_var, glb_rsp_var, glb_rsp_var_raw, "sold", glb_category_var)]
# Empty containers for the models fitted later and their summary rows
glb_models_df <- data.frame()
glb_models_lst <- list()

# Candidate model methods, chosen by problem type.
# Methods that are switched off are listed in comments with the reason noted at the time.
glbMdlMethods <-
    if (glb_is_regression) {
        # Regression
        #   disabled: "lm"
        #   disabled: "bayesglm"  - crashing w/ parallel processing
        #   disabled: "nnet", "avNNet" - predicts 1 for all obs in bid0_sp;
        #             runs 25 models per cv sample for tunelength=5
        #   disabled: "svmPoly"   - crashing w/ parallel processing;
        #             runs 75 models per cv sample for tunelength=5
        #   disabled: "svmRadial" - crashing w/ parallel processing
        #   disabled: "bagEarth"  - Takes a long time
        c("glm", "glmnet", "rpart",      # deterministic
          "gbm", "rf",                   # non-deterministic
          "svmLinear", "svmLinear2",     # Unknown
          "earth")
    } else if (glb_is_binomial) {
        # Classification - Add ada,bagEarth (auto feature selection)
        #   "bayesglm" is enabled here although it was noted as
        #             crashing w/ parallel processing
        #   "nnet", "avNNet" run 25 models per cv sample for tunelength=5
        #   disabled: "svmLinear", "svmLinear2", "svmPoly", "svmRadial"
        #             - crashing with interaction variables
        #             ("svmPoly" runs 75 models per cv sample for tunelength=5)
        #   disabled: "bagEarth"  - Takes a long time
        c("glm", "bayesglm", "glmnet", "rpart",   # deterministic
          "gbm", "rf",                            # non-deterministic
          "nnet", "avNNet",                       # Unknown
          "earth")
    } else {
        # Neither regression nor binomial
        c("rpart", "rf", "gbm")
    }

# Per-family feature overrides (none configured here)
glb_mdl_feats_lst <- list()

# family: Choose from c("RFE.X", "CSM.X", "All.X", "Best.Interact")
# methods: Choose from c(NULL, <method>, glbMdlMethods)
# A non-NULL method list is mandatory for every family listed.
glb_mdl_family_lst <- list(
    "RFE.X"         = glbMdlMethods,
    "All.X"         = "glmnet",
    "Best.Interact" = "glmnet"
)
### bid1_sp
# glb_mdl_family_lst[["CSM.X"]] <- "glmnet"
# glb_mdl_feats_lst[["CSM.X"]] <- c(NULL
# # from RFE.X
# , "startprice.dgt1.is9", "startprice.dcm2.is9", "startprice.dcm1.is9", "startprice.dgt2.is9"
# #, "condition.fctr"
# , "prdl.descr.my.fctr", "color.fctr"
# #, "D.ratio.weight.sum.wrds.n"
# , "cellular.fctr", "cellular.fctr:carrier.fctr"
#
# # from RFE.X.Interact
# , "cellular.fctr:prdl.descr.my.fctr", "cellular.fctr:startprice.dgt2.is9", "cellular.fctr:startprice.dgt1.is9", "cellular.fctr:color.fctr"
# , "cellular.fctr:condition.fctr" # RMSE up with keeping condition.fctr in the model
# # RMSE & R.sq up with removing condition.fctr from the model
# , "cellular.fctr:D.ratio.weight.sum.wrds.n"
# )
### bid1_sp
### !_sp
# glb_mdl_family_lst[["CSM.X"]] <- "glmnet"
# glb_mdl_feats_lst[["CSM.X"]] <- c(NULL
# , "D.ratio.sum.TfIdf.wrds.n", "D.TfIdf.sum.stem.stop.Ratio",
# , "D.npnct15.log", "D.npnct03.log", "D.wrds.n.log", "D.chrs.n.log")
# indep_vars <- union(setdiff(indep_vars, interact_vars_vctr),
# paste(glb_category_var, interact_vars_vctr,
# sep=ifelse(grepl("\\.fctr", glb_category_var), "*", ".fctr*")))
# indep_vars <- union(setdiff(indep_vars,
# c("startprice.log.diff", "startprice.unit9", "biddable", "cellular.fctr", "carrier.fctr")),
# c("startprice.log.diff*biddable", "startprice.unit9*biddable", "cellular.fctr*carrier.fctr"))
### !_sp
# Check if interaction features make fit better
# Check if tuning parameters make fit better
# Tuning-grid overrides start empty; the commented myrbind_df(...) calls that follow
# show the rows (method / parameter / vals or min,max,by) tried in earlier runs.
glb_tune_models_df <- data.frame()
#RFE.X.avNNet
### bid0_sp
# size=[1] 3 5 7 9; decay=[0] 1e-04 0.001 0.01 0.1; bag=[FALSE]; RMSE=1.3300906
### bid1_sp
# size=[1] 3 5 7 9; decay=0 0.0001 [0.001] 0.01 0.1; bag=[FALSE]; RMSE=0.9285472
### bid0&1_sp
#RFE.X.bagEarth
### bid0_sp
#RFE.X.bagEarth degree=[1]; nprune=[33]; RMSE=0.1507259
### bid1_sp
#RFE.X.bagEarth degree=[1]; nprune=[32]; RMSE=0.6379639
#RFE.X.bagEarth degree=[1] 2 3; nprune=8 16 32 64 [128]; RMSE=0.6334405
#RFE.X.bagEarth degree=1 [2]; nprune=16 32 64 128 [256]; RMSE=0.6211924
#RFE.X.bagEarth degree=1 [2]; nprune=64 128 200 225 [256]; RMSE=0.6320776 (up)
#RFE.X.bagEarth degree=[1] 2; nprune=64 128 225 256 [275]; RMSE=0.640644 (up)
#RFE.X.bagEarth degree=1 [2] 3; nprune=64 128 200 [256] 300; RMSE=0.6496039 (up)
#RFE.X.bagEarth degree=1 [2] 3; nprune=32 64 128 256 [512]; RMSE=0.6404529 (up)
#RFE.X.bagEarth degree=1 [2] 3; nprune=64 128 256 512 [1024]; RMSE=0.6486663 (up)
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "bagEarth", parameter = "nprune", vals = "256")
# ,data.frame(method = "bagEarth", parameter = "degree", vals = "2")
# ))
### bid0&1_sp
### bid0_sp
#RFE.X.earth degree=[1]; nprune=2 [9] 17 25 33; RMSE=0.1334478
### bid0_sp
#RFE.X.gbm
### bid0_sp
# shrinkage=[0.1]; n.trees=50 100 150 [200] 250; RMSE=0.2062651
# shrinkage=0.00 0.05 0.10 0.15 [0.20]; n.trees=50 [100] 150 200 250; interaction.depth=1 [2] 3 4 5; n.minobsinnode=[10]; RMSE=0.2019453
# shrinkage=0.00 0.05 [0.10] 0.15 0.20; n.trees=50 100 150 200 [250]; interaction.depth=[1] 2 3 4 5; n.minobsinnode=[10]; RMSE=0.2008313
# shrinkage=0.05 [0.10] 0.15 0.20 0.25; n.trees=100 150 200 [250] 300; interaction.depth=[1] 2 3 4 5; n.minobsinnode=[10]; RMSE=0.2008313
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "gbm", parameter = "shrinkage", min = 0.05, max = 0.25, by = 0.05)
# ,data.frame(method = "gbm", parameter = "n.trees", min = 100, max = 300, by = 50)
# ,data.frame(method = "gbm", parameter = "interaction.depth", min = 1, max = 5, by = 1)
# ,data.frame(method = "gbm", parameter = "n.minobsinnode", min = 10, max = 10, by = 10)
# #seq(from=0.05, to=0.25, by=0.05)
# ))
### bid1_sp
# shrinkage=[0.1]; n.trees=50 100 150 200 [250]; interaction.depth=1 2 3 4 [5]; n.minobsinnode=[10]; RMSE=0.5054172
# shrinkage=0.03 [0.04] 0.05 0.06 0.07; n.trees=100 [150] 200 250 300; interaction.depth=2 3 4 5 [6]; n.minobsinnode=6 [8] 10 12 14; RMSE=0.5036430
# shrinkage=0.03 [0.04] 0.05 0.06 0.07; n.trees=100 150 [200] 250 300; interaction.depth=3 4 5 [6] 7; n.minobsinnode=6 8 [10] 12 14; RMSE=0.502774
# shrinkage=0.04; n.trees=200; interaction.depth=6; n.minobsinnode=10; RMSE=0.502774
# shrinkage=[0.05] 0.10 0.15 0.20 0.25; n.trees=100 [150] 200 250 300; interaction.depth=2 3 [4] 5 6; n.minobsinnode=[10]; RMSE=0.5058678 (up)
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "gbm", parameter = "shrinkage", vals = "0.04")
# ,data.frame(method = "gbm", parameter = "n.trees", vals = "200")
# ,data.frame(method = "gbm", parameter = "interaction.depth", vals = "6")
# ,data.frame(method = "gbm", parameter = "n.minobsinnode", vals = "10")
# ))
### bid0&1_sp
#RFE.X.glmnet
### bid1_sp
# alpha=0.100 [0.325] 0.550 0.775 1.000; lambda=0.0005232693 0.0024288010 0.0112734954 [0.0523269304] 0.2428800957; RMSE=0.6164891
### bid1_sp
#RFE.X.nnet
### bid0_sp
# size=[1] 3 5 7 9; decay=[0] 1e-04 0.001 0.01 0.1; RMSE=1.3300906
### bid1_sp
# size=1 3 5 7 [9]; decay=0e+00 1e-04 1e-03 1e-02 [1e-01]; RMSE=0.9289109
# size=3 5 [7] 9 11; decay=0.0001 0.001 0.01 [0.1] 0.2; RMSE=0.9287422
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "nnet", parameter = "size", vals = "3 5 7 9 11")
# ,data.frame(method = "nnet", parameter = "decay", vals = "0.0001 0.0010 0.0100 0.1000 0.2000")
# ))
### bid0&1_sp
#RFE.X.rf
### bid0_sp
# mtry=2 35 [68] 101 134; RMSE=0.1331992
# mtry=2 35 68 [101] 134; RMSE=0.1339974
### bid0_sp
# mtry=2 41 81 121 [161]; Accuracy=0.8398314
# mtry=41 81 [121] 161 181; Accuracy=0.8282403
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "rf", parameter = "mtry", vals = "41 81 121 141 161")
# ))
#RFE.X.rpart
### bid0_sp
# cp=[0.03230142] 0.06012801 0.09395662 0.12251081 0.35258370; RMSE=0.1771138
# cp=0.020 [0.025] 0.030 0.035 0.040; RMSE=0.1770237
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method="rpart", parameter="cp", min=0.02, max=0.04, by=0.005)
# ))
### bid1_sp
# cp=[0.008081388] 0.016191995 0.027590245 0.299848193 0.361621486; RMSE=0.5294398
# cp=[0.005] 0.006 0.007 0.008 0.009 0.010; RMSE=0.522678
# cp=0.001 [0.003] 0.005 0.007 0.009; RMSE=0.5186586
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method="rpart", parameter="cp", min=0.001, max=0.010, by=0.002)
# ))
### bid0&1_sp
#RFE.X.svmLinear
### bid0_sp
# C=[1]; RMSE=0.1374094
# C=1e-02 [0.1] 5e-01 1e+00 2e+00 3e+00 4e+00 1e+01 1e+02; RMSE=0.1271318
# C=0.01 0.05 [0.10] 0.50 1.00 2.00 3.00 4.00; RMSE=0.1271318; 0.1296718
### bid1_sp
# C=[1]; RMSE=0.6614060
# C=1e-02 [1e-01] 1e+00 1e+01 1e+02; RMSE=0.6373977
# C=[0.05] 0.10 0.50 1.00 10.00; RMSE=0.6324697
# C=0.01 [0.05] 0.10 0.50 1.00; RMSE=0.6324697
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "svmLinear", parameter = "C", vals = "0.01 0.05 0.1 0.5 1")
# ))
### bid0&1_sp
#RFE.X.svmLinear2
### bid0_sp
# cost=[0.25] 0.50 1.00 2.00 4.00; RMSE=0.1276354
# cost=0.0625 0.1250 [0.25] 0.50 1.00; RMSE=0.1276354
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "svmLinear2", parameter = "cost", vals = "0.0625 0.125 0.25 0.5 1")
### bid1_sp
# cost=[0.25] 0.50 1.00 2.00 4.00; RMSE=0.6483622
# cost=[0.0625] 0.1250 0.25 0.50 1.00; RMSE=0.6335311
# cost=0.0312 [0.0625] 0.1250 0.25 0.50; RMSE=0.6335311
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method = "svmLinear2", parameter = "cost", vals = "0.0312 0.0625 0.125 0.25 0.50")
# ))
### bid0&1_sp
#RFE.X.svmPoly
### bid0_sp
# degree=[1] 2 3; scale=0.001 0.01 [0.1] 1 10; C=0.25 0.50 1.00 [2.00] 4.00; RMSE=0.1276130
# degree=[1] 2 3 4 5; scale=0.01 0.05 [0.1] 0.5 1; C=0.50 1.00 [2.00] 3.00 4.00; RMSE=0.1276130
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method="svmPoly", parameter="degree", min=1, max=5, by=1) #seq(1, 5, 1)
# ))
# glb_tune_models_df <- myrbind_df(glb_tune_models_df, rbind(data.frame()
# ,data.frame(method="svmPoly", parameter="scale", vals="0.01, 0.05, 0.1, 0.5, 1")
# ,data.frame(method="svmPoly", parameter="C", vals="0.50, 1.00, 2.00, 3.00, 4.00")
# ))
### bid0_sp
#RFE.X.svmRadial
### bid0_sp
# sigma=[0.08674323]; C=0.25 0.50 1.00 [2.00] 4.00; RMSE=0.1614957
### bid0_sp
#data.frame(parameter="mtry", min=080, max=100, by=10),
#glb_to_sav(); all.equal(sav_models_df, glb_models_df)
#glb_models_df <- subset(sav_models_df, id != "RFE.X.rf"); print(sort(glb_models_df$id))
# Pre-processing methods to try.
# NOTE: single assignment -- the value is the uncommented c(...) on the last line of this
# block; the commented vectors above it are the bid0_sp / bid1_sp variants.
glb_preproc_methods <- #NULL
### bid0_sp
# c("YeoJohnson", "center.scale",
# # crashes with train: all the RMSE metric values are missing
# # probably due to interaction vars
# "range", "pca", "ica",
# "spatialSign")
### bid0_sp
### bid1_sp
# c("YeoJohnson", "center.scale", "range", "pca", "ica", "spatialSign")
### bid1_sp
c("YeoJohnson", "center.scale", "range", "pca", "ica", "spatialSign")
# Baseline prediction model feature(s)
glb_Baseline_mdl_var <- NULL # or c("<col_name>")
# Optional custom model metric. All four settings below are NULL in this run; the
# commented text shows the shape of a cost matrix and of a summary function that
# weights the confusion matrix by it.
glbMdlMetric_terms <- NULL # or matrix(c(
# 0,1,2,3,4,
# 2,0,1,2,3,
# 4,2,0,1,2,
# 6,4,2,0,1,
# 8,6,4,2,0
# ), byrow=TRUE, nrow=5)
glbMdlMetricSummary <- NULL # or "<metric_name>"
glbMdlMetricMaximize <- NULL # or FALSE (TRUE is not the default for both classification & regression)
glbMdlMetricSummaryFn <- NULL # or function(data, lev=NULL, model=NULL) {
# confusion_mtrx <- t(as.matrix(confusionMatrix(data$pred, data$obs)))
# #print(confusion_mtrx)
# #print(confusion_mtrx * glbMdlMetric_terms)
# metric <- sum(confusion_mtrx * glbMdlMetric_terms) / nrow(data)
# names(metric) <- glbMdlMetricSummary
# return(metric)
# }
# Resampling settings (presumably folds and repeats for repeated cross-validation -- confirm)
glb_rcv_n_folds <- 3 # or NULL
glb_rcv_n_repeats <- 3 # or NULL
# Classification probability threshold; NULL here (0.5 is the commented alternative)
glb_clf_proba_threshold <- NULL # 0.5
# Model selection criteria
# Metric names are prefixed with "min."/"max."; they are listed per problem type.
# The two checks are independent: if both flags were TRUE the classification
# metrics would overwrite the regression ones.
if (glb_is_regression) {
    glbMdlMetricsEval <- c("min.RMSE.OOB", "max.R.sq.OOB", "max.Adj.R.sq.fit")
    # fit-based alternative:
    #glbMdlMetricsEval <- c("min.RMSE.fit", "max.R.sq.fit", "max.Adj.R.sq.fit")
}

if (glb_is_classification) {
    glbMdlMetricsEval <-
        if (glb_is_binomial) {
            c("max.AUCROCR.OOB", "max.AUCpROC.OOB", "max.Accuracy.OOB",
              "min.aic.fit", "max.Accuracy.fit")
        } else {
            c("max.Accuracy.OOB", "max.Kappa.OOB")
        }
}
# select from NULL [no ensemble models], "auto" [all models better than MFO or Baseline], c(mdl_ids in glb_models_lst) [Typically top-rated models in auto]
# NOTE: single assignment -- the value is the uncommented c(...) at the end of this block;
# the commented vectors are the bid0_sp / bid1_sp member lists from earlier runs.
glb_mdl_ensemble <- #"auto"
### bid0_sp
# c("RFE.X.glm"
# #, "RFE.X.bayesglm"
# , "RFE.X.glmnet", "RFE.X.rpart", "RFE.X.gbm", "RFE.X.rf", "RFE.X.svmLinear", "RFE.X.svmLinear2"
# #, "RFE.X.svmPoly", "RFE.X.svmRadial"
# , "RFE.X.earth", "RFE.X.bagEarth", "RFE.X.Interact.glmnet", "RFE.X.YeoJohnson.glmnet", "RFE.X.center.scale.glmnet", "RFE.X.spatialSign.glmnet")
### bid1_sp
# "auto"; err.abs.fit.sum=76.699774; min.RMSE.fit=0.2186429
# "RFE.X.*"; err.abs.fit.sum=; min.RMSE.fit=0.221114
# c("RFE.X.spatialSign.rf", "RFE.X.YeoJohnson.rf", "RFE.X.center.scale.rf", "RFE.X.rf", "RFE.X.avNNet", "RFE.X.bagEarth", "RFE.X.earth", "RFE.X.gbm", "RFE.X.glmnet", "RFE.X.nnet", "RFE.X.svmLinear2", "RFE.X.glm", "RFE.X.svmLinear", "RFE.X.rpart")
### bid1_sp
# Active ensemble members:
c("RFE.X.rpart", "RFE.X.earth", "RFE.X.glmnet", "RFE.X.glm", "RFE.X.bayesglm", "RFE.X.nnet", "RFE.X.avNNet", "RFE.X.Interact.glmnet", "RFE.X.gbm"
#,"RFE.X.bagEarth" #takes a long time
)
# Scratch one-liner: correlation of the per-model prediction columns in glb_fitobs_df.
# Only for classifications; for regressions remove "(.*)\\.prob" from the regex
# tmp_fitobs_df <- glb_fitobs_df[, grep(paste0("^", gsub(".", "\\.", glb_rsp_var_out, fixed = TRUE), "RFE\\.X\\.(.*)\\.prob"), names(glb_fitobs_df), value = TRUE)]; cor_mtrx <- cor(tmp_fitobs_df); cor_vctr <- sort(cor_mtrx[row.names(orderBy(~-Overall, varImp(glb_models_lst[[glb_sel_mdl_id]])$importance))[1], ]); summary(cor_vctr); cor_vctr
# Selected and final model ids
glb_sel_mdl_id <- "Ensemble.glmnet" #select from c(NULL, "All.X.glmnet", "RFE.X.glmnet", "Ensemble.auto.glmnet", "Ensemble.glmnet")
glb_fin_mdl_id <- NULL #select from c(NULL, glb_sel_mdl_id)
# Column names listed for display
glb_dsp_cols <- c("sold", ".grpid", "color", "condition", "cellular", "carrier", "storage", "biddable", "startprice", "sprice.predict.diff")
# Output settings
glb_out_obs <- NULL # "all" for bid._sp # select from c(NULL, "all", "new", "trn")
glb_out_vars_lst <- list()
# glb_id_var will be the first output column, by default
### !_sp
# The value is stored as a string starting with "%<d-%"
# (presumably evaluated later, once glb_fin_mdl_id is known -- confirm).
glb_out_vars_lst[["Probability1"]] <- "%<d-% paste0(glb_rsp_var_out, glb_fin_mdl_id, '.prob')"
### !_sp
### bid._sp
# glb_out_vars_lst[[glb_rsp_var_raw]] <- glb_rsp_var_raw
# glb_out_vars_lst[[paste0(head(unlist(strsplit(glb_rsp_var_out, "")), -1), collapse = "")]] <-
# "%<d-% paste0(glb_rsp_var_out, glb_fin_mdl_id)"
### bid._sp
# Output prefix (presumably prepended to generated file names -- confirm)
glb_out_pfx <- "ebayipads_finmdl_RFE_sizes_bid0_"
glb_save_envir <- FALSE # or TRUE
# Depict process
# Petri net of the analytics workflow: six transitions, four places
# (with three initial tokens on "bgn"), and twelve arcs between them.
glb_analytics_pn <- petrinet(
    name = "glb_analytics_pn",

    # Transitions
    trans_df = data.frame(
        id   = 1:6,
        name = c("data.training.all",
                 "data.new",
                 "model.selected",
                 "model.final",
                 "data.training.all.prediction",
                 "data.new.prediction"),
        x    = c( -5, -5, -15, -25, -25, -35),
        y    = c( -5,  5,   0,   0,  -5,   5)
    ),

    # Places; M0 is the initial marking
    places_df = data.frame(
        id   = 1:4,
        name = c("bgn", "fit.data.training.all", "predict.data.new", "end"),
        x    = c( -0, -20, -30, -40),
        y    = c(  0,   0,   0,   0),
        M0   = c(  3,   0,   0,   0)
    ),

    # Arcs; begin[i] -> end[i]
    arcs_df = data.frame(
        begin = c("bgn",
                  "bgn",
                  "bgn",
                  "data.training.all",
                  "model.selected",
                  "fit.data.training.all",
                  "fit.data.training.all",
                  "model.final",
                  "data.new",
                  "predict.data.new",
                  "data.training.all.prediction",
                  "data.new.prediction"),
        end   = c("data.training.all",
                  "data.new",
                  "model.selected",
                  "fit.data.training.all",
                  "fit.data.training.all",
                  "model.final",
                  "data.training.all.prediction",
                  "predict.data.new",
                  "predict.data.new",
                  "data.new.prediction",
                  "end",
                  "end")
    )
)

# Plot with flipped coordinates; the unflipped variant is kept for reference
#print(ggplot.petrinet(glb_analytics_pn))
print(ggplot.petrinet(glb_analytics_pn) + coord_flip())
## Loading required package: grid
# Nothing is recorded in glb_analytics_avl_objs yet
glb_analytics_avl_objs <- NULL
# Start the chunk log with the "import.data" step (myadd_chunk is a project helper;
# the printed table below shows the label, step numbers and begin time it records)
glb_chunks_df <- myadd_chunk(NULL, "import.data")
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 9.734 NA NA
1.0: import data
#glb_chunks_df <- myadd_chunk(NULL, "import.data")
# Snapshot the glb_* work objects into sav_* copies in the global environment.
#
# Takes no arguments. For each object below, the global variable glb_<x> is copied
# to the global variable sav_<x>:
#   - glb_allobs_df and glb_trnobs_df are required: a missing one raises the usual
#     "object ... not found" error.
#   - glb_fitobs_df, glb_OOBobs_df, glb_newobs_df, glb_ctgry_df, glb_models_lst,
#     glb_models_df, glb_feats_df and glb_featsimp_df are optional: each is copied
#     only when it exists in the global environment and is not NULL.
#
# Existence is checked with exists(..., inherits = FALSE), i.e. by exact name.
# The earlier grepl()-over-ls() check matched substrings, so an unrelated global
# such as glb_feats_df_bak satisfied the guard and the call then failed with
# "object 'glb_feats_df' not found".
#
# Returns NULL invisibly; the function is called for its side effects only.
glb_to_sav <- function() {
    genv <- globalenv()

    # Copy glb_<x> to sav_<x> inside the global environment
    copy_to_sav <- function(glb_name, glb_obj) {
        assign(sub("^glb_", "sav_", glb_name), glb_obj, envir = genv)
    }

    # Required observation sets: get() raises an error if one is missing
    for (glb_name in c("glb_allobs_df", "glb_trnobs_df")) {
        copy_to_sav(glb_name, get(glb_name, envir = genv))
    }

    # Optional objects: skipped when absent or NULL
    optional_names <- c("glb_fitobs_df", "glb_OOBobs_df", "glb_newobs_df",
                        "glb_ctgry_df", "glb_models_lst", "glb_models_df",
                        "glb_feats_df", "glb_featsimp_df")
    for (glb_name in optional_names) {
        if (!exists(glb_name, envir = genv, inherits = FALSE)) next
        glb_obj <- get(glb_name, envir = genv, inherits = FALSE)
        if (is.null(glb_obj)) next
        #print(paste0("Attempting to save ", glb_name, "..."))
        copy_to_sav(glb_name, glb_obj)
    }

    invisible(NULL)
}
# Import the training observations from glb_trnng_url with the project helper
# myimport_data; the console output that follows shows the file read, its
# dimensions, sample rows and str() of the result.
glb_trnobs_df <- myimport_data(url=glb_trnng_url, comment="glb_trnobs_df",
force_header=TRUE)
## [1] "Reading file ./data/eBayiPadTrain.csv..."
## [1] "dimensions of data in ./data/eBayiPadTrain.csv: 1,861 rows x 11 cols"
## description
## 1 iPad is in 8.5+ out of 10 cosmetic condition!
## 2 Previously used, please read description. May show signs of use such as scratches to the screen and
## 3
## 4
## 5 Please feel free to buy. All products have been thoroughly inspected, cleaned and tested to be 100%
## 6
## biddable startprice condition cellular carrier color
## 1 0 159.99 Used 0 None Black
## 2 1 0.99 Used 1 Verizon Unknown
## 3 0 199.99 Used 0 None White
## 4 0 235.00 New other (see details) 0 None Unknown
## 5 0 199.99 Seller refurbished Unknown Unknown Unknown
## 6 1 175.00 Used 1 AT&T Space Gray
## storage productline sold UniqueID
## 1 16 iPad 2 0 10001
## 2 16 iPad 2 1 10002
## 3 16 iPad 4 1 10003
## 4 16 iPad mini 2 0 10004
## 5 Unknown Unknown 0 10005
## 6 32 iPad mini 2 1 10006
## description
## 65
## 283 Pristine condition, comes with a case and stylus.
## 948 \211\333\317Used Apple Ipad 16 gig 1st generation in Great working condition and 100% functional.Very little
## 1354
## 1366 Item still in complete working order, minor scratches, normal wear and tear but no damage. screen is
## 1840
## biddable startprice condition cellular carrier color
## 65 0 195.00 Used 0 None Unknown
## 283 1 20.00 Used 0 None Unknown
## 948 0 110.00 Seller refurbished 0 None Black
## 1354 0 300.00 Used 0 None White
## 1366 1 125.00 Used Unknown Unknown Unknown
## 1840 0 249.99 Used 1 Sprint Space Gray
## storage productline sold UniqueID
## 65 16 iPad mini 0 10065
## 283 64 iPad 1 0 10283
## 948 32 iPad 1 0 10948
## 1354 16 iPad Air 1 11354
## 1366 Unknown iPad 1 1 11366
## 1840 16 iPad Air 1 11840
## description
## 1856 Overall item is in good condition and is fully operational and ready to use. Comes with box and
## 1857 Used. Tested. Guaranteed to work. Physical condition grade B+ does have some light scratches and
## 1858 This item is brand new and was never used; however, the box and/or packaging has been opened.
## 1859
## 1860 This unit has minor scratches on case and several small scratches on the display. \nIt is in
## 1861 30 Day Warranty. Fully functional engraved iPad 1st Generation with signs of normal wear which
## biddable startprice condition cellular carrier
## 1856 0 89.50 Used 1 AT&T
## 1857 0 239.95 Used 0 None
## 1858 0 329.99 New other (see details) 0 None
## 1859 0 400.00 New 0 None
## 1860 0 89.00 Seller refurbished 0 None
## 1861 0 119.99 Used 1 AT&T
## color storage productline sold UniqueID
## 1856 Unknown 16 iPad 1 0 11856
## 1857 Black 32 iPad 4 1 11857
## 1858 Space Gray 16 iPad Air 0 11858
## 1859 Gold 16 iPad mini 3 0 11859
## 1860 Black 64 iPad 1 1 11860
## 1861 Black 64 iPad 1 0 11861
## 'data.frame': 1861 obs. of 11 variables:
## $ description: chr "iPad is in 8.5+ out of 10 cosmetic condition!" "Previously used, please read description. May show signs of use such as scratches to the screen and " "" "" ...
## $ biddable : int 0 1 0 0 0 1 1 0 1 1 ...
## $ startprice : num 159.99 0.99 199.99 235 199.99 ...
## $ condition : chr "Used" "Used" "Used" "New other (see details)" ...
## $ cellular : chr "0" "1" "0" "0" ...
## $ carrier : chr "None" "Verizon" "None" "None" ...
## $ color : chr "Black" "Unknown" "White" "Unknown" ...
## $ storage : chr "16" "16" "16" "16" ...
## $ productline: chr "iPad 2" "iPad 2" "iPad 4" "iPad mini 2" ...
## $ sold : int 0 1 1 0 0 1 1 0 1 1 ...
## $ UniqueID : int 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 ...
## - attr(*, "comment")= chr "glb_trnobs_df"
## NULL
# glb_trnobs_df <- read.delim("data/hygiene.txt", header=TRUE, fill=TRUE, sep="\t",
# fileEncoding='iso-8859-1')
# glb_trnobs_df <- read.table("data/hygiene.dat.labels", col.names=c("dirty"),
# na.strings="[none]")
# glb_trnobs_df$review <- readLines("data/hygiene.dat", n =-1)
# comment(glb_trnobs_df) <- "glb_trnobs_df"
# glb_trnobs_df <- data.frame()
# for (symbol in c("Boeing", "CocaCola", "GE", "IBM", "ProcterGamble")) {
# sym_trnobs_df <-
# myimport_data(url=gsub("IBM", symbol, glb_trnng_url), comment="glb_trnobs_df",
# force_header=TRUE)
# sym_trnobs_df$Symbol <- symbol
# glb_trnobs_df <- myrbind_df(glb_trnobs_df, sym_trnobs_df)
# }
# glb_trnobs_df <-
# glb_trnobs_df %>% dplyr::filter(Year >= 1999)
# Build glb_newobs_df (the "new" observations) and glb_allobs_df.
#
# - Separate new-observations dataset: import it from glb_newdt_url and set
#   glb_allobs_df to the training rows followed by the new rows.
# - Otherwise glb_allobs_df is the imported training data, and the new observations
#   are carved out of it according to glb_split_newdata_method:
#     "condition" - rows matching glb_split_newdata_condition become new observations
#     "sample"    - caTools::sample.split on the raw response, seeded
#     "copy"      - training and new observations are both the full data
#   Any other method is an error.
if (glb_is_separate_newobs_dataset) {
    glb_newobs_df <- myimport_data(url=glb_newdt_url, comment="glb_newobs_df",
                                   force_header=TRUE)

    # To make plots / stats / checks easier in chunk:inspectORexplore.data
    glb_allobs_df <- myrbind_df(glb_trnobs_df, glb_newobs_df)
    comment(glb_allobs_df) <- "glb_allobs_df"
} else {
    glb_allobs_df <- glb_trnobs_df
    comment(glb_allobs_df) <- "glb_allobs_df"

    if (!glb_split_entity_newobs_datasets) {
        stop("Not implemented yet")
        # Draft that used to follow the stop() as unreachable code, kept for reference:
        # glb_newobs_df <- glb_trnobs_df[sample(1:nrow(glb_trnobs_df),
        #                                       max(2, nrow(glb_trnobs_df) / 1000)),]
    } else if (glb_split_newdata_method == "condition") {
        # NOTE(review): subset() treats an NA condition as FALSE, so rows for which the
        # condition evaluates to NA end up in neither data set -- confirm this is intended.
        glb_newobs_df <- do.call("subset",
            list(glb_trnobs_df, parse(text=glb_split_newdata_condition)))
        glb_trnobs_df <- do.call("subset",
            list(glb_trnobs_df, parse(text=paste0("!(",
                                                  glb_split_newdata_condition,
                                                  ")"))))
    } else if (glb_split_newdata_method == "sample") {
        # require() only warns and returns FALSE when the package is missing;
        # fail here with a clear message instead of at the sample.split() call.
        if (!require(caTools))
            stop("Package 'caTools' is required when glb_split_newdata_method == 'sample'")
        set.seed(glb_split_sample.seed)
        split <- sample.split(glb_trnobs_df[, glb_rsp_var_raw],
                              SplitRatio=(1-glb_split_newdata_size_ratio))
        glb_newobs_df <- glb_trnobs_df[!split, ]
        glb_trnobs_df <- glb_trnobs_df[split ,]
    } else if (glb_split_newdata_method == "copy") {
        glb_trnobs_df <- glb_allobs_df
        comment(glb_trnobs_df) <- "glb_trnobs_df"
        # The comment on glb_newobs_df is set once for all methods below
        glb_newobs_df <- glb_allobs_df
    } else stop("glb_split_newdata_method should be %in% c('condition', 'sample', 'copy')")

    comment(glb_newobs_df) <- "glb_newobs_df"
    myprint_df(glb_newobs_df)
    str(glb_newobs_df)

    if (glb_split_entity_newobs_datasets) {
        myprint_df(glb_trnobs_df)
        str(glb_trnobs_df)
    }
}
## [1] "Reading file ./data/eBayiPadTest.csv..."
## [1] "dimensions of data in ./data/eBayiPadTest.csv: 798 rows x 10 cols"
## description
## 1 like new
## 2 Item is in great shape. I upgraded to the iPad Air 2 and don't need the mini any longer, even though
## 3 This iPad is working and is tested 100%. It runs great. It is in good condition. Cracked digitizer.
## 4
## 5 Grade A condition means that the Ipad is 100% working condition. Cosmetically 8/9 out of 10 - Will
## 6 Brand new factory sealed iPad in an OPEN BOX...THE BOX ITSELF IS HEAVILY DISTRESSED(see
## biddable startprice condition cellular carrier color
## 1 0 105.00 Used 1 AT&T Unknown
## 2 0 195.00 Used 0 None Unknown
## 3 0 219.99 Used 0 None Unknown
## 4 1 100.00 Used 0 None Unknown
## 5 0 210.99 Manufacturer refurbished 0 None Black
## 6 0 514.95 New other (see details) 0 None Gold
## storage productline UniqueID
## 1 32 iPad 1 11862
## 2 16 iPad mini 2 11863
## 3 64 iPad 3 11864
## 4 16 iPad mini 11865
## 5 32 iPad 3 11866
## 6 64 iPad Air 2 11867
## description
## 1 like new
## 142 iPad mini 1st gen wi-fi 16gb is in perfect working order.
## 309 In excellent condition. Minor scratches on the back. Screen in mint condition. Comes in original
## 312 iPad is in Great condition, the screen is in great condition showing only a few minor scratches, the
## 320 Good condition and fully functional
## 369
## biddable startprice condition cellular carrier color storage
## 1 0 105.00 Used 1 AT&T Unknown 32
## 142 1 0.99 Used 0 None Unknown 16
## 309 0 200.00 Used 1 AT&T Black 32
## 312 1 0.99 Used 0 None Unknown 16
## 320 1 60.00 Used 0 None White 16
## 369 1 197.97 Used 0 None Unknown 64
## productline UniqueID
## 1 iPad 1 11862
## 142 iPad mini 12003
## 309 iPad 3 12170
## 312 iPad mini 2 12173
## 320 iPad 1 12181
## 369 iPad mini 3 12230
## description
## 793 Crack on digitizer near top. Top line of digitizer does not respond to touch. Other than that, all
## 794
## 795
## 796
## 797
## 798 Slightly Used. Includes everything you need plus a nice leather case!\nThere is a slice mark on the
## biddable startprice condition cellular carrier color
## 793 0 104.00 For parts or not working 1 Unknown Black
## 794 0 95.00 Used 1 AT&T Unknown
## 795 1 199.99 Manufacturer refurbished 0 None White
## 796 0 149.99 Used 0 None Unknown
## 797 0 7.99 New Unknown Unknown Unknown
## 798 0 139.00 Used 1 Unknown Black
## storage productline UniqueID
## 793 16 iPad 2 12654
## 794 64 iPad 1 12655
## 795 16 iPad 4 12656
## 796 16 iPad 2 12657
## 797 Unknown iPad 3 12658
## 798 32 Unknown 12659
## 'data.frame': 798 obs. of 10 variables:
## $ description: chr "like new" "Item is in great shape. I upgraded to the iPad Air 2 and don't need the mini any longer, even though " "This iPad is working and is tested 100%. It runs great. It is in good condition. Cracked digitizer." "" ...
## $ biddable : int 0 0 0 1 0 0 0 0 0 1 ...
## $ startprice : num 105 195 220 100 211 ...
## $ condition : chr "Used" "Used" "Used" "Used" ...
## $ cellular : chr "1" "0" "0" "0" ...
## $ carrier : chr "AT&T" "None" "None" "None" ...
## $ color : chr "Unknown" "Unknown" "Unknown" "Unknown" ...
## $ storage : chr "32" "16" "64" "16" ...
## $ productline: chr "iPad 1" "iPad mini 2" "iPad 3" "iPad mini" ...
## $ UniqueID : int 11862 11863 11864 11865 11866 11867 11868 11869 11870 11871 ...
## - attr(*, "comment")= chr "glb_newobs_df"
## NULL
# Sanity checks on the train / new partitions, followed by removal of the
# columns listed in glb_drop_vars from all three data frames.

# The raw response must be fully populated in the training observations
if ((num_nas <- sum(is.na(glb_trnobs_df[, glb_rsp_var_raw]))) > 0)
    stop("glb_trnobs_df$", glb_rsp_var_raw, " contains NAs for ", num_nas, " obs")

# Warn when a partition has as many rows as the combined data
if (nrow(glb_trnobs_df) == nrow(glb_allobs_df))
    warning("glb_trnobs_df same as glb_allobs_df")
if (nrow(glb_newobs_df) == nrow(glb_allobs_df))
    warning("glb_newobs_df same as glb_allobs_df")

if (length(glb_drop_vars) > 0) {
    warning("dropping vars: ", paste0(glb_drop_vars, collapse=", "))
    # drop=FALSE keeps each object a data frame even when only a single
    # column survives; without it the result collapses to a plain vector
    glb_allobs_df <- glb_allobs_df[, setdiff(names(glb_allobs_df), glb_drop_vars),
                                   drop=FALSE]
    glb_trnobs_df <- glb_trnobs_df[, setdiff(names(glb_trnobs_df), glb_drop_vars),
                                   drop=FALSE]
    glb_newobs_df <- glb_newobs_df[, setdiff(names(glb_newobs_df), glb_drop_vars),
                                   drop=FALSE]
}
#stop(here"); sav_allobs_df <- glb_allobs_df # glb_allobs_df <- sav_allobs_df
# Combine trnent & newobs into glb_allobs_df for easier manipulation
# Label each partition with its origin, then stack both into glb_allobs_df
glb_trnobs_df$.src <- "Train"
glb_newobs_df$.src <- "Test"
glbFeatsExclude <- union(glbFeatsExclude, ".src")
glb_allobs_df <- myrbind_df(glb_trnobs_df, glb_newobs_df)
comment(glb_allobs_df) <- "glb_allobs_df"

# No identifier column configured: fall back to the row names
if (length(glb_id_var) == 0) {
    warning("using .rownames as identifiers for observations")
    glb_allobs_df$.rownames <- rownames(glb_allobs_df)
    glb_trnobs_df$.rownames <- rownames(subset(glb_allobs_df, .src == "Train"))
    glb_newobs_df$.rownames <- rownames(subset(glb_allobs_df, .src == "Test"))
    glb_id_var <- ".rownames"
}

# Identifiers must be unique across the combined observations
if (any(duplicated(glb_allobs_df[, glb_id_var, drop = FALSE]))) {
    stop(glb_id_var, " duplicated in glb_allobs_df")
}

# The identifier is not a feature; keep the combined data sorted by it
glbFeatsExclude <- union(glbFeatsExclude, glb_id_var)
glb_allobs_df <- orderBy(reformulate(glb_id_var), glb_allobs_df)

# From here on only the combined data frame is used
glb_trnobs_df <- NULL
glb_newobs_df <- NULL
# For Tableau
# Export the combined observations to a fixed relative path.
# NOTE(review): this export runs before the corrections below are applied, so
# the CSV holds the uncorrected values - confirm that this is intended.
write.csv(glb_allobs_df, "data/eBayiPadAll.csv", row.names=FALSE)
#stop(here"); glb_to_sav()
# Make any data corrections here
# Hard-coded fix: for the observation(s) whose glb_id_var equals 10986, set
# cellular to "1" and carrier to "T-Mobile"
glb_allobs_df[glb_allobs_df[, glb_id_var] == 10986, "cellular"] <- "1"
glb_allobs_df[glb_allobs_df[, glb_id_var] == 10986, "carrier"] <- "T-Mobile"
# - Merge glb_obs_stack_condition & glbObsDropCondition
# - Derive glb_obs_stack|drop_chk_vars from condition automatically
# - Implement glb_obs_stack_condition & glb_obs_stack_chk_vars options
dsp_partition_stats <- function(obs_df, vars=NULL) {
    # Prints row counts of obs_df cross-tabulated by .src and by each of
    # `vars` plus the raw response (glb_rsp_var_raw).
    #
    # obs_df: data frame holding a .src column and every column named in vars
    # vars:   character vector of column names to break the counts down by
    #
    # Numeric columns with more than 5 distinct values are binned into 3
    # intervals (stored as "<name>.cut.fctr" in the local copy of obs_df)
    # and the binned column is tabulated in their place.
    xtab_cols <- NULL
    for (col_name in c(vars, glb_rsp_var_raw)) {
        col_vals <- obs_df[, col_name]
        needs_bins <- is.numeric(col_vals) && (length(unique(col_vals)) > 5)
        if (!needs_bins) {
            xtab_cols <- union(xtab_cols, col_name)
            next
        }

        binned_name <- paste0(col_name, ".cut.fctr")
        obs_df[, binned_name] <- cut(col_vals, 3)
        xtab_cols <- union(xtab_cols, binned_name)
    }

    print("Partition stats:")

    # All tabulation columns at once
    print(mycreate_sqlxtab_df(obs_df, union(xtab_cols, ".src")))

    # Then each tabulation column on its own, with a bar chart of the counts
    for (xtab_col in xtab_cols) {
        freq_df <- mycreate_sqlxtab_df(obs_df, union(xtab_col, ".src"))
        print(freq_df)
        print(myplot_hbar(freq_df, ".src", ".n", colorcol_name=xtab_col))
    }

    # Finally the plain partition sizes
    print(mycreate_sqlxtab_df(obs_df, ".src"))

    # Earlier inline version, kept for reference:
    # if (length(unique(glb_allobs_df[, glb_rsp_var_raw])) > 5) {
    # cut_var <- paste0(glb_rsp_var_raw, ".cut.fctr")
    # glb_allobs_df[, cut_var] <- cut(glb_allobs_df[, glb_rsp_var_raw], 3)
    # glbFeatsExclude <- union(glbFeatsExclude, cut_var)
    # glb_obs_stack_chk_vars <- union(cut_var, glb_obs_stack_chk_vars)
    # } else glb_obs_stack_chk_vars <- union(glb_rsp_var_raw, glb_obs_stack_chk_vars)
    # #glb_obs_stack_chk_vars <- union(glb_obs_stack_chk_vars, ".src")
    # print(mycreate_sqlxtab_df(glb_allobs_df, union(var, ".src")))
    # print(mycreate_sqlxtab_df(glb_allobs_df, union(glb_obs_stack_chk_vars, ".src")))
    # for (var in glb_obs_stack_chk_vars) {
    # print(mycreate_sqlxtab_df(glb_allobs_df, union(var, ".src")))
    # }
    # print(mycreate_sqlxtab_df(glb_allobs_df, ".src"))
}
myget_symbols <- function(txt) {
    # Returns the unique variable names referenced in a piece of R code.
    #
    # txt: character string holding R code (e.g. a filter condition), or NULL
    #
    # Returns NULL when txt is NULL; otherwise a character vector of the
    # distinct tokens that the R parser classifies as "SYMBOL". Function
    # names used in calls and operators are classified under other token
    # types by the parser and are therefore not reported.
    if (is.null(txt)) return(NULL)

    #print(getParseData(parse(text=txt, keep.source=TRUE)))
    parse_df <- getParseData(parse(text=txt, keep.source=TRUE))

    # getParseData() is documented to return NULL when no parse data is
    # present; report "no symbols" instead of failing on the lookup below
    if (is.null(parse_df)) return(character(0))

    # Plain logical indexing avoids subset()'s non-standard evaluation
    return(unique(parse_df$text[parse_df$token == "SYMBOL"]))
}
# tokens <- unlist(strsplit(gsub("[[:punct:]|[:space:]]", " ", glbObsDropCondition), " "))
# tokens <- tokens[tokens != ""]
# glb_obs_drop_chk_vars <- c("biddable") # or NULL
# Display the partition counts broken down by the variables referenced in
# glbObsDropCondition (as extracted by myget_symbols)
dsp_partition_stats(obs_df=glb_allobs_df, vars=myget_symbols(glbObsDropCondition))
## [1] "Partition stats:"
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: DBI
## Loading required package: tcltk
## UniqueID.cut.fctr productline biddable sold .src .n
## 1 (1e+04,1.09e+04] iPad 2 1 1 Train 90
## 2 (1.18e+04,1.27e+04] iPad 2 0 NA Test 84
## 3 (1e+04,1.09e+04] iPad 1 1 1 Train 78
## 4 (1.18e+04,1.27e+04] iPad 2 1 NA Test 70
## 5 (1.09e+04,1.18e+04] iPad mini 0 0 Train 68
## 6 (1.09e+04,1.18e+04] iPad 2 0 0 Train 67
## 7 (1.09e+04,1.18e+04] Unknown 0 0 Train 64
## 8 (1e+04,1.09e+04] iPad mini 1 1 Train 60
## 9 (1.18e+04,1.27e+04] iPad mini 1 NA Test 59
## 10 (1.09e+04,1.18e+04] iPad Air 2 0 0 Train 54
## 11 (1.18e+04,1.27e+04] Unknown 0 NA Test 54
## 12 (1.09e+04,1.18e+04] iPad Air 0 0 Train 52
## 13 (1.18e+04,1.27e+04] iPad mini 0 NA Test 52
## 14 (1.18e+04,1.27e+04] iPad 1 1 NA Test 50
## 15 (1e+04,1.09e+04] iPad 3 1 1 Train 50
## 16 (1.09e+04,1.18e+04] iPad 1 0 0 Train 43
## 17 (1.18e+04,1.27e+04] iPad 4 0 NA Test 42
## 18 (1e+04,1.09e+04] iPad 2 0 0 Train 42
## 19 (1e+04,1.09e+04] iPad Air 1 1 Train 42
## 20 (1.18e+04,1.27e+04] Unknown 1 NA Test 38
## 21 (1.18e+04,1.27e+04] iPad 1 0 NA Test 38
## 22 (1.18e+04,1.27e+04] iPad Air 0 NA Test 38
## 23 (1.09e+04,1.18e+04] iPad 4 0 0 Train 36
## 24 (1.09e+04,1.18e+04] iPad mini 1 1 Train 36
## 25 (1.18e+04,1.27e+04] iPad Air 1 NA Test 36
## 26 (1.18e+04,1.27e+04] iPad Air 2 0 NA Test 36
## 27 (1.18e+04,1.27e+04] iPad mini 2 1 NA Test 34
## 28 (1e+04,1.09e+04] iPad mini 0 0 Train 34
## 29 (1.09e+04,1.18e+04] iPad mini 3 0 0 Train 33
## 30 (1e+04,1.09e+04] iPad 4 0 0 Train 33
## 31 (1.09e+04,1.18e+04] Unknown 1 1 Train 32
## 32 (1.18e+04,1.27e+04] iPad 3 0 NA Test 32
## 33 (1e+04,1.09e+04] iPad 4 1 1 Train 32
## 34 (1e+04,1.09e+04] iPad Air 0 0 Train 32
## 35 (1.09e+04,1.18e+04] iPad 3 0 0 Train 29
## 36 (1e+04,1.09e+04] iPad 1 0 0 Train 29
## 37 (1e+04,1.09e+04] iPad Air 2 1 1 Train 29
## 38 (1.09e+04,1.18e+04] iPad mini 2 0 0 Train 28
## 39 (1.18e+04,1.27e+04] iPad 4 1 NA Test 26
## 40 (1.18e+04,1.27e+04] iPad Air 2 1 NA Test 26
## 41 (1e+04,1.09e+04] iPad 3 0 0 Train 26
## 42 (1.18e+04,1.27e+04] iPad mini 3 0 NA Test 24
## 43 (1e+04,1.09e+04] Unknown 1 1 Train 24
## 44 (1e+04,1.09e+04] iPad Air 2 0 0 Train 24
## 45 (1e+04,1.09e+04] iPad mini 2 1 1 Train 24
## 46 (1.09e+04,1.18e+04] iPad 2 1 1 Train 23
## 47 (1.09e+04,1.18e+04] iPad mini 1 0 Train 23
## 48 (1.18e+04,1.27e+04] iPad 3 1 NA Test 23
## 49 (1e+04,1.09e+04] Unknown 0 0 Train 23
## 50 (1.18e+04,1.27e+04] iPad mini 2 0 NA Test 22
## 51 (1.09e+04,1.18e+04] iPad Air 2 1 1 Train 21
## 52 (1.09e+04,1.18e+04] Unknown 1 0 Train 20
## 53 (1.09e+04,1.18e+04] iPad Air 1 1 Train 19
## 54 (1.09e+04,1.18e+04] iPad mini 0 1 Train 18
## 55 (1.09e+04,1.18e+04] iPad 2 0 1 Train 17
## 56 (1e+04,1.09e+04] iPad mini 2 0 0 Train 16
## 57 (1.09e+04,1.18e+04] Unknown 0 1 Train 15
## 58 (1.09e+04,1.18e+04] iPad 1 1 1 Train 15
## 59 (1.09e+04,1.18e+04] iPad 2 1 0 Train 15
## 60 (1e+04,1.09e+04] iPad 1 1 0 Train 15
## 61 (1.18e+04,1.27e+04] iPad mini 3 1 NA Test 14
## 62 (1e+04,1.09e+04] iPad 2 0 1 Train 14
## 63 (1e+04,1.09e+04] iPad mini 0 1 Train 14
## 64 (1e+04,1.09e+04] iPad mini 1 0 Train 14
## 65 (1e+04,1.09e+04] iPad mini 3 0 0 Train 14
## 66 (1.09e+04,1.18e+04] iPad 1 0 1 Train 13
## 67 (1e+04,1.09e+04] iPad 1 0 1 Train 13
## 68 (1.09e+04,1.18e+04] iPad 3 1 1 Train 12
## 69 (1.09e+04,1.18e+04] iPad Air 2 0 1 Train 12
## 70 (1e+04,1.09e+04] iPad mini 3 1 1 Train 12
## 71 (1.09e+04,1.18e+04] iPad 4 1 1 Train 11
## 72 (1.09e+04,1.18e+04] iPad Air 2 1 0 Train 11
## 73 (1e+04,1.09e+04] iPad 3 0 1 Train 11
## 74 (1.09e+04,1.18e+04] iPad mini 2 0 1 Train 10
## 75 (1.09e+04,1.18e+04] iPad mini 2 1 1 Train 10
## 76 (1e+04,1.09e+04] iPad 2 1 0 Train 10
## 77 (1e+04,1.09e+04] iPad 4 1 0 Train 10
## 78 (1.09e+04,1.18e+04] iPad 4 1 0 Train 9
## 79 (1e+04,1.09e+04] iPad 3 1 0 Train 9
## 80 (1e+04,1.09e+04] iPad Air 0 1 Train 9
## 81 (1.09e+04,1.18e+04] iPad 4 0 1 Train 8
## 82 (1.09e+04,1.18e+04] iPad mini 2 1 0 Train 8
## 83 (1.09e+04,1.18e+04] iPad mini 3 1 0 Train 8
## 84 (1.18e+04,1.27e+04] iPad 1 0 0 Train 8
## 85 (1e+04,1.09e+04] iPad 4 0 1 Train 8
## 86 (1.09e+04,1.18e+04] iPad 1 1 0 Train 7
## 87 (1.18e+04,1.27e+04] Unknown 0 0 Train 7
## 88 (1.18e+04,1.27e+04] iPad Air 2 0 0 Train 7
## 89 (1e+04,1.09e+04] iPad Air 1 0 Train 7
## 90 (1e+04,1.09e+04] iPad Air 2 0 1 Train 7
## 91 (1.09e+04,1.18e+04] iPad Air 0 1 Train 6
## 92 (1.09e+04,1.18e+04] iPad Air 1 0 Train 6
## 93 (1.09e+04,1.18e+04] iPad mini 3 0 1 Train 6
## 94 (1.09e+04,1.18e+04] iPad mini 3 1 1 Train 6
## 95 (1.18e+04,1.27e+04] Unknown 0 1 Train 6
## 96 (1.18e+04,1.27e+04] iPad mini 0 0 Train 6
## 97 (1e+04,1.09e+04] Unknown 1 0 Train 6
## 98 (1.09e+04,1.18e+04] iPad 3 0 1 Train 5
## 99 (1.09e+04,1.18e+04] iPad 3 1 0 Train 5
## 100 (1.18e+04,1.27e+04] iPad 2 0 0 Train 5
## 101 (1.18e+04,1.27e+04] iPad 4 0 0 Train 5
## 102 (1.18e+04,1.27e+04] iPad Air 0 0 Train 5
## 103 (1e+04,1.09e+04] Unknown 0 1 Train 5
## 104 (1.18e+04,1.27e+04] iPad 3 0 0 Train 4
## 105 (1.18e+04,1.27e+04] iPad mini 3 0 0 Train 4
## 106 (1e+04,1.09e+04] iPad Air 2 1 0 Train 4
## 107 (1e+04,1.09e+04] iPad mini 2 0 1 Train 4
## 108 (1e+04,1.09e+04] iPad mini 2 1 0 Train 4
## 109 (1.09e+04,1.18e+04] iPad mini Retina 0 0 Train 3
## 110 (1.18e+04,1.27e+04] iPad 1 0 1 Train 3
## 111 (1.18e+04,1.27e+04] iPad 1 1 1 Train 3
## 112 (1.18e+04,1.27e+04] iPad 4 0 1 Train 3
## 113 (1.18e+04,1.27e+04] iPad mini 1 1 Train 3
## 114 (1e+04,1.09e+04] iPad mini 3 0 1 Train 3
## 115 (1e+04,1.09e+04] iPad mini 3 1 0 Train 3
## 116 (1.18e+04,1.27e+04] Unknown 1 0 Train 2
## 117 (1.18e+04,1.27e+04] iPad 2 1 1 Train 2
## 118 (1.18e+04,1.27e+04] iPad 4 1 1 Train 2
## 119 (1.18e+04,1.27e+04] iPad Air 0 1 Train 2
## 120 (1.18e+04,1.27e+04] iPad mini 2 0 0 Train 2
## 121 (1.09e+04,1.18e+04] iPad 5 1 1 Train 1
## 122 (1.09e+04,1.18e+04] iPad mini Retina 0 1 Train 1
## 123 (1.18e+04,1.27e+04] iPad 2 0 1 Train 1
## 124 (1.18e+04,1.27e+04] iPad 3 0 1 Train 1
## 125 (1.18e+04,1.27e+04] iPad 3 1 1 Train 1
## 126 (1.18e+04,1.27e+04] iPad Air 2 0 1 Train 1
## 127 (1.18e+04,1.27e+04] iPad Air 2 1 1 Train 1
## 128 (1.18e+04,1.27e+04] iPad mini 0 1 Train 1
## 129 (1.18e+04,1.27e+04] iPad mini 2 0 1 Train 1
## 130 (1.18e+04,1.27e+04] iPad mini 3 1 0 Train 1
## 131 (1.18e+04,1.27e+04] iPad mini Retina 0 1 Train 1
## 132 (1e+04,1.09e+04] iPad mini Retina 0 0 Train 1
## 133 (1e+04,1.09e+04] iPad mini Retina 0 1 Train 1
## 134 (1e+04,1.09e+04] iPad mini Retina 1 1 Train 1
## UniqueID.cut.fctr .src .n
## 1 (1e+04,1.09e+04] Train 887
## 2 (1.09e+04,1.18e+04] Train 886
## 3 (1.18e+04,1.27e+04] Test 798
## 4 (1.18e+04,1.27e+04] Train 88
## productline .src .n
## 1 iPad 2 Train 286
## 2 iPad mini Train 277
## 3 iPad 1 Train 227
## 4 Unknown Train 204
## 5 iPad Air Train 180
## 6 iPad Air 2 Train 171
## 7 iPad 4 Train 157
## 8 iPad 2 Test 154
## 9 iPad 3 Train 153
## 10 iPad mini Test 111
## 11 iPad mini 2 Train 107
## 12 Unknown Test 92
## 13 iPad mini 3 Train 90
## 14 iPad 1 Test 88
## 15 iPad Air Test 74
## 16 iPad 4 Test 68
## 17 iPad Air 2 Test 62
## 18 iPad mini 2 Test 56
## 19 iPad 3 Test 55
## 20 iPad mini 3 Test 38
## 21 iPad mini Retina Train 8
## 22 iPad 5 Train 1
## biddable .src .n
## 1 0 Train 1024
## 2 1 Train 837
## 3 0 Test 422
## 4 1 Test 376
## sold .src .n
## 1 0 Train 1001
## 2 1 Train 860
## 3 NA Test 798
## .src .n
## 1 Train 1861
## 2 Test 798
# Remove the observations that match glbObsDropCondition (an R expression
# held as text); skipped entirely when no condition is configured
if (!is.null(glbObsDropCondition)) {
print(sprintf("Running glbObsDropCondition filter: %s", glbObsDropCondition))
# The condition text is wrapped in !( ... ), parsed and handed to subset(),
# so only the rows for which the negated condition is TRUE are kept.
# NOTE(review): subset() treats NA as FALSE, so rows for which the condition
# evaluates to NA are dropped as well - confirm this is acceptable.
glb_allobs_df <- do.call("subset",
list(glb_allobs_df, parse(text=paste0("!(", glbObsDropCondition, ")"))))
# Show the partition counts again, now for the filtered data
dsp_partition_stats(obs_df=glb_allobs_df, vars=myget_symbols(glbObsDropCondition))
}
## [1] "Running glbObsDropCondition filter: (UniqueID %in% c(NULL\n , 11234 #sold=0; 2 other dups(10306, 11503) are sold=1\n , 11844 #sold=0; 3 other dups(11721, 11738, 11812) are sold=1\n )) | \n (productline %in% c('iPad 5', 'iPad mini Retina')) |\n (biddable != 0) # bid0_sp\n # | (biddable == 0) # bid1_sp\n "
## [1] "Partition stats:"
## UniqueID.cut.fctr productline biddable sold .src .n
## 1 (1.18e+04,1.27e+04] iPad 2 0 NA Test 84
## 2 (1.09e+04,1.18e+04] iPad mini 0 0 Train 68
## 3 (1.09e+04,1.18e+04] iPad 2 0 0 Train 67
## 4 (1.09e+04,1.18e+04] Unknown 0 0 Train 64
## 5 (1.09e+04,1.18e+04] iPad Air 2 0 0 Train 54
## 6 (1.18e+04,1.27e+04] Unknown 0 NA Test 54
## 7 (1.09e+04,1.18e+04] iPad Air 0 0 Train 52
## 8 (1.18e+04,1.27e+04] iPad mini 0 NA Test 52
## 9 (1.09e+04,1.18e+04] iPad 1 0 0 Train 42
## 10 (1.18e+04,1.27e+04] iPad 4 0 NA Test 42
## 11 (1e+04,1.09e+04] iPad 2 0 0 Train 42
## 12 (1.18e+04,1.27e+04] iPad 1 0 NA Test 38
## 13 (1.18e+04,1.27e+04] iPad Air 0 NA Test 38
## 14 (1.09e+04,1.18e+04] iPad 4 0 0 Train 36
## 15 (1.18e+04,1.27e+04] iPad Air 2 0 NA Test 36
## 16 (1e+04,1.09e+04] iPad mini 0 0 Train 34
## 17 (1.09e+04,1.18e+04] iPad mini 3 0 0 Train 33
## 18 (1e+04,1.09e+04] iPad 4 0 0 Train 33
## 19 (1.18e+04,1.27e+04] iPad 3 0 NA Test 32
## 20 (1e+04,1.09e+04] iPad Air 0 0 Train 32
## 21 (1.09e+04,1.18e+04] iPad 3 0 0 Train 29
## 22 (1e+04,1.09e+04] iPad 1 0 0 Train 29
## 23 (1.09e+04,1.18e+04] iPad mini 2 0 0 Train 28
## 24 (1e+04,1.09e+04] iPad 3 0 0 Train 26
## 25 (1.18e+04,1.27e+04] iPad mini 3 0 NA Test 24
## 26 (1e+04,1.09e+04] iPad Air 2 0 0 Train 24
## 27 (1e+04,1.09e+04] Unknown 0 0 Train 23
## 28 (1.18e+04,1.27e+04] iPad mini 2 0 NA Test 22
## 29 (1.09e+04,1.18e+04] iPad mini 0 1 Train 18
## 30 (1.09e+04,1.18e+04] iPad 2 0 1 Train 17
## 31 (1e+04,1.09e+04] iPad mini 2 0 0 Train 16
## 32 (1.09e+04,1.18e+04] Unknown 0 1 Train 15
## 33 (1e+04,1.09e+04] iPad 2 0 1 Train 14
## 34 (1e+04,1.09e+04] iPad mini 0 1 Train 14
## 35 (1e+04,1.09e+04] iPad mini 3 0 0 Train 14
## 36 (1.09e+04,1.18e+04] iPad 1 0 1 Train 13
## 37 (1e+04,1.09e+04] iPad 1 0 1 Train 13
## 38 (1.09e+04,1.18e+04] iPad Air 2 0 1 Train 12
## 39 (1e+04,1.09e+04] iPad 3 0 1 Train 11
## 40 (1.09e+04,1.18e+04] iPad mini 2 0 1 Train 10
## 41 (1e+04,1.09e+04] iPad Air 0 1 Train 9
## 42 (1.09e+04,1.18e+04] iPad 4 0 1 Train 8
## 43 (1e+04,1.09e+04] iPad 4 0 1 Train 8
## 44 (1.18e+04,1.27e+04] Unknown 0 0 Train 7
## 45 (1.18e+04,1.27e+04] iPad 1 0 0 Train 7
## 46 (1.18e+04,1.27e+04] iPad Air 2 0 0 Train 7
## 47 (1e+04,1.09e+04] iPad Air 2 0 1 Train 7
## 48 (1.09e+04,1.18e+04] iPad Air 0 1 Train 6
## 49 (1.09e+04,1.18e+04] iPad mini 3 0 1 Train 6
## 50 (1.18e+04,1.27e+04] Unknown 0 1 Train 6
## 51 (1.18e+04,1.27e+04] iPad mini 0 0 Train 6
## 52 (1.09e+04,1.18e+04] iPad 3 0 1 Train 5
## 53 (1.18e+04,1.27e+04] iPad 2 0 0 Train 5
## 54 (1.18e+04,1.27e+04] iPad 4 0 0 Train 5
## 55 (1.18e+04,1.27e+04] iPad Air 0 0 Train 5
## 56 (1e+04,1.09e+04] Unknown 0 1 Train 5
## 57 (1.18e+04,1.27e+04] iPad 3 0 0 Train 4
## 58 (1.18e+04,1.27e+04] iPad mini 3 0 0 Train 4
## 59 (1e+04,1.09e+04] iPad mini 2 0 1 Train 4
## 60 (1.18e+04,1.27e+04] iPad 1 0 1 Train 3
## 61 (1.18e+04,1.27e+04] iPad 4 0 1 Train 3
## 62 (1e+04,1.09e+04] iPad mini 3 0 1 Train 3
## 63 (1.18e+04,1.27e+04] iPad Air 0 1 Train 2
## 64 (1.18e+04,1.27e+04] iPad mini 2 0 0 Train 2
## 65 (1.18e+04,1.27e+04] iPad 2 0 1 Train 1
## 66 (1.18e+04,1.27e+04] iPad 3 0 1 Train 1
## 67 (1.18e+04,1.27e+04] iPad Air 2 0 1 Train 1
## 68 (1.18e+04,1.27e+04] iPad mini 0 1 Train 1
## 69 (1.18e+04,1.27e+04] iPad mini 2 0 1 Train 1
## UniqueID.cut.fctr .src .n
## 1 (1.09e+04,1.18e+04] Train 583
## 2 (1.18e+04,1.27e+04] Test 422
## 3 (1e+04,1.09e+04] Train 361
## 4 (1.18e+04,1.27e+04] Train 71
## productline .src .n
## 1 iPad 2 Train 146
## 2 iPad mini Train 141
## 3 Unknown Train 120
## 4 iPad 1 Train 107
## 5 iPad Air Train 106
## 6 iPad Air 2 Train 105
## 7 iPad 4 Train 93
## 8 iPad 2 Test 84
## 9 iPad 3 Train 76
## 10 iPad mini 2 Train 61
## 11 iPad mini 3 Train 60
## 12 Unknown Test 54
## 13 iPad mini Test 52
## 14 iPad 4 Test 42
## 15 iPad 1 Test 38
## 16 iPad Air Test 38
## 17 iPad Air 2 Test 36
## 18 iPad 3 Test 32
## 19 iPad mini 3 Test 24
## 20 iPad mini 2 Test 22
## biddable .src .n
## 1 0 Train 1015
## 2 0 Test 422
## sold .src .n
## 1 0 Train 798
## 2 NA Test 422
## 3 1 Train 217
## .src .n
## 1 Train 1015
## 2 Test 422
# Check for duplicates by all features
require(gdata)
## Loading required package: gdata
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
#print(names(glb_allobs_df))
# Collect every observation flagged by duplicated2() once UniqueID, sold and
# .src are set aside, sorted so that matching listings sit next to each other
dup_allobs_df <- orderBy(
    ~ productline + description + startprice + biddable,
    glb_allobs_df[
        duplicated2(subset(glb_allobs_df, select = -c(UniqueID, sold, .src))), ])
print(sprintf("Found %d duplicates by all features:", nrow(dup_allobs_df)))
## [1] "Found 139 duplicates by all features:"
myprint_df(dup_allobs_df)
## description
## 385
## 390
## 1204
## 1315
## 2464
## 1029 A device listed in near mint used cosmetic condition with light blemishes from use. Housing &
## biddable startprice condition cellular carrier color storage
## 385 0 15 Used 0 None Black 16
## 390 0 15 Used 0 None Black 16
## 1204 0 15 Used 0 None Black 16
## 1315 0 399 Used Unknown AT&T White 32
## 2464 0 399 Used Unknown AT&T White 32
## 1029 0 108 Used 1 AT&T White 32
## productline sold UniqueID .src
## 385 Unknown 0 10385 Train
## 390 Unknown 0 10390 Train
## 1204 Unknown 0 11204 Train
## 1315 Unknown 0 11315 Train
## 2464 Unknown NA 12464 Test
## 1029 Unknown 0 11029 Train
## description
## 934 A device listed in near mint used cosmetic condition with light blemishes from use. Housing &
## 737 Near perfect cosmetic, perfect working condition.
## 1605
## 239 Like New
## 1167
## 1569
## biddable startprice condition cellular carrier color
## 934 0 162.00 Used 1 Verizon White
## 737 0 220.00 Used 0 None Unknown
## 1605 0 229.00 For parts or not working Unknown Unknown Unknown
## 239 0 398.99 Used 0 None Unknown
## 1167 0 375.00 Used 0 None White
## 1569 0 375.00 Used 0 None White
## storage productline sold UniqueID .src
## 934 16 iPad 2 0 10934 Train
## 737 16 iPad 3 0 10737 Train
## 1605 16 iPad Air 1 11605 Train
## 239 128 iPad Air 0 10239 Train
## 1167 64 iPad mini 2 1 11167 Train
## 1569 64 iPad mini 2 0 11569 Train
## description biddable startprice condition cellular carrier color
## 8 0 329.99 New 0 None White
## 660 0 329.99 New 0 None White
## 319 0 345.00 New 0 None Gold
## 1886 0 345.00 New 0 None Gold
## 1363 0 498.88 New 1 Verizon Gold
## 1394 0 498.88 New 1 Verizon Gold
## storage productline sold UniqueID .src
## 8 16 iPad mini 3 0 10008 Train
## 660 16 iPad mini 3 0 10660 Train
## 319 16 iPad mini 3 1 10319 Train
## 1886 16 iPad mini 3 NA 11886 Test
## 1363 16 iPad mini 3 0 11363 Train
## 1394 16 iPad mini 3 0 11394 Train
# print(dup_allobs_df[, c(glb_id_var, glb_rsp_var_raw,
# "description", "startprice", "biddable")])
# write.csv(dup_allobs_df[, c("UniqueID"), FALSE], "ebayipads_dups.csv", row.names=FALSE)
# Collapse every feature column (all but sold, UniqueID and .src) into one
# "#"-separated key so that duplicate listings share the same key
dupobs_df <- tidyr::unite(dup_allobs_df, "allfeats", -c(sold, UniqueID, .src), sep="#")
# dupobs_df <- dplyr::group_by(dupobs_df, allfeats)
# dupobs_df <- dupobs_df[, "UniqueID", FALSE]
# dupobs_df <- ungroup(dupobs_df)
#
# dupobs_df$.rownames <- row.names(dupobs_df)

# One group id per distinct key, attached back to each duplicate observation
grpobs_df <- data.frame(allfeats=unique(dupobs_df[, "allfeats"]))
grpobs_df$.grpid <- row.names(grpobs_df)
dupobs_df <- merge(dupobs_df, grpobs_df)
# dupobs_tbl <- table(dupobs_df$.grpid)
# print(max(dupobs_tbl))
# print(dupobs_tbl[which.max(dupobs_tbl)])
# print(dupobs_df[dupobs_df$.grpid == names(dupobs_tbl[which.max(dupobs_tbl)]), ])
# print(dupobs_df[dupobs_df$.grpid == 106, ])
# for (grpid in c(9, 17, 31, 36, 53))
# print(dupobs_df[dupobs_df$.grpid == grpid, ])

# Count, per group, how many members have each value of sold (NA included
# when present), one column per value
dupgrps_df <- as.data.frame(table(dupobs_df$.grpid, dupobs_df$sold, useNA="ifany"))
names(dupgrps_df)[c(1,2)] <- c(".grpid", "sold")
dupgrps_df$.grpid <- as.numeric(as.character(dupgrps_df$.grpid))
dupgrps_df <- tidyr::spread(dupgrps_df, sold, Freq)
names(dupgrps_df)[-1] <- paste("sold", names(dupgrps_df)[-1], sep=".")

# Group size = sum over the sold.* count columns. rowSums is vectorised and,
# unlike iterating over 1:nrow(), stays correct for a zero-row data frame;
# as.integer keeps the integer type that summing the counts produced.
dupgrps_df$.freq <- as.integer(rowSums(dupgrps_df[, -1, drop=FALSE]))
myprint_df(orderBy(~-.freq, dupgrps_df))
## .grpid sold.0 sold.1 sold.NA .freq
## 18 18 0 2 2 4
## 1 1 3 0 0 3
## 16 16 0 3 0 3
## 31 31 3 0 0 3
## 37 37 3 0 0 3
## 55 55 3 0 0 3
## .grpid sold.0 sold.1 sold.NA .freq
## 10 10 0 2 0 2
## 23 23 1 0 1 2
## 28 28 2 0 0 2
## 33 33 1 1 0 2
## 45 45 2 0 0 2
## 48 48 0 1 1 2
## .grpid sold.0 sold.1 sold.NA .freq
## 61 61 2 0 0 2
## 62 62 1 0 1 2
## 63 63 1 1 0 2
## 64 64 2 0 0 2
## 65 65 0 1 1 2
## 66 66 2 0 0 2
print("sold Conflicts:")
## [1] "sold Conflicts:"
# Duplicate groups containing at least one sold=0 and at least one sold=1
# member, i.e. identical listings with different outcomes
print(subset(dupgrps_df, (sold.0 > 0) & (sold.1 > 0)))
## .grpid sold.0 sold.1 sold.NA .freq
## 8 8 1 1 0 2
## 9 9 1 1 0 2
## 27 27 1 1 0 2
## 32 32 1 1 0 2
## 33 33 1 1 0 2
## 43 43 1 1 0 2
## 47 47 1 1 0 2
## 57 57 1 1 0 2
## 63 63 1 1 0 2
#dupobs_df[dupobs_df$.grpid == 4, ]
# Attach the duplicate-group id to every observation; all.x keeps the
# observations that belong to no duplicate group
glb_allobs_df <- merge(x = glb_allobs_df,
                       y = dupobs_df[, c(glb_id_var, ".grpid")],
                       by = glb_id_var, all.x = TRUE)

# Halt when some group has both outcomes in unequal numbers
if (any(with(dupgrps_df, (sold.0 > 0) & (sold.1 > 0) & (sold.0 != sold.1)),
        na.rm = TRUE)) {
    stop("Duplicate conflicts are resolvable")
}
#subset(glb_allobs_df, .grpid %in% c(25))
#mydsp_obs(list(productline.contains="iPad 1", storage.contains="16", color.contains="Black", carrier.contains="None", cellular.contains="0", condition.contains="Used", startprice=80), cols=c("productline", "storage", "color", "carrier", "cellular", "condition", "startprice", "sold"))
print("Test & Train Groups:")
## [1] "Test & Train Groups:"
# Duplicate groups with at least one member whose sold value is NA.
# NOTE(review): the sold.NA column exists only when the table() call that
# built dupgrps_df (useNA="ifany") encountered NAs; otherwise this lookup
# fails - confirm that case cannot occur.
print(subset(dupgrps_df, (sold.NA > 0)))
## .grpid sold.0 sold.1 sold.NA .freq
## 2 2 1 0 1 2
## 4 4 0 0 2 2
## 5 5 1 0 1 2
## 7 7 0 0 2 2
## 11 11 1 0 1 2
## 13 13 1 0 1 2
## 15 15 0 1 1 2
## 17 17 0 0 2 2
## 18 18 0 2 2 4
## 19 19 0 1 1 2
## 20 20 0 0 2 2
## 21 21 1 0 1 2
## 22 22 1 0 1 2
## 23 23 1 0 1 2
## 24 24 1 0 1 2
## 25 25 0 0 2 2
## 26 26 1 0 1 2
## 35 35 0 0 2 2
## 36 36 1 0 1 2
## 38 38 1 0 1 2
## 39 39 0 0 2 2
## 41 41 0 1 1 2
## 42 42 1 0 1 2
## 46 46 0 1 1 2
## 48 48 0 1 1 2
## 49 49 1 0 1 2
## 52 52 0 1 1 2
## 53 53 1 0 1 2
## 56 56 1 0 1 2
## 62 62 1 0 1 2
## 65 65 0 1 1 2
# .grpid is a bookkeeping column, not a feature. union() matches how
# glbFeatsExclude is extended elsewhere in this file and avoids a duplicate
# entry when this chunk is run more than once.
glbFeatsExclude <- union(".grpid", glbFeatsExclude)
#stop(here"); glb_to_sav(); all.equal(sav_allobs_df, glb_allobs_df); glb_allobs_df <- sav_allobs_df
# Optionally enrich the observations with columns read from extra CSV files
if (!is.null(glbInpMerge)) {
    print("Running glbInpMerge specs...")

    # Read each listed file, announcing it first
    mrgParts <- lapply(glbInpMerge$fnames, function(mrgFile) {
        print(sprintf(" Appending rows from %s...", mrgFile))
        read.csv(mrgFile)
    })

    # Stack the files' rows on top of an empty data frame, in file order
    obsMrg <- do.call(rbind, c(list(data.frame()), mrgParts))

    # Left join: every existing observation is kept
    glb_allobs_df <- merge(glb_allobs_df, obsMrg, all.x = TRUE)
}
## [1] "Running glbInpMerge specs..."
## [1] " Appending rows from ebayipads_finmdl_bid0_sp_out.csv..."
## [1] " Appending rows from ebayipads_mdlens_bid1_sp_out.csv..."
# Display the partition counts broken down by the variables referenced in
# glb_obs_repartition_train_condition; myget_symbols returns NULL for a NULL
# condition, in which case only the raw response is used for the breakdown
dsp_partition_stats(obs_df = glb_allobs_df,
vars = myget_symbols(glb_obs_repartition_train_condition))
## [1] "Partition stats:"
## sold .src .n
## 1 0 Train 798
## 2 NA Test 422
## 3 1 Train 217
## sold .src .n
## 1 0 Train 798
## 2 NA Test 422
## 3 1 Train 217
## .src .n
## 1 Train 1015
## 2 Test 422
# Optionally re-assign observations to the Train / Test partitions using
# glb_obs_repartition_train_condition (an R expression held as text)
if (!is.null(glb_obs_repartition_train_condition)) {
print(sprintf("Running glb_obs_repartition_train_condition filter: %s",
glb_obs_repartition_train_condition))
# Earlier attempts at setting .src in place, kept for reference:
# glb_allobs_df <- mutate(glb_allobs_df, .src=ifelse(!is.na(sold) & (sold == 1),
# "Train", "Test"))
# glb_allobs_df <- mutate_(glb_allobs_df,
# .src=interp(ifelse(eval(parse(text="!is.na(sold) & (sold == 1)")),
# "Train", "Test")))
# glb_allobs_df <- within(glb_allobs_df, {
# .src <- ifelse(eval(parse(text="!is.na(sold) & (sold == 1)")),
# "Train", "Test")
# })
# glb_allobs_df <- within(glb_allobs_df, {
# if(eval(parse(text="!is.na(sold) & (sold == 1)"))) .src <- "Train" else
# .src <- "Test"
# })
# with(glb_allobs_df, {
# src <- ifelse(eval(parse(text="!is.na(sold) & (sold == 1)")),
# "Train", "Test")
# })
# glb_allobs_df$.src <- sapply(1:nrow(glb_allobs_df), function (row_ix) ifelse)
# glb_allobs_df[parse(text=paste0("!(", glbObsDropCondition, ")")), ".src"] <- do.call("subset",
# list(glb_allobs_df, ))
# Rows satisfying the condition become the training observations ...
glb_trnobs_df <- do.call("subset", list(glb_allobs_df,
parse(text=paste0(" (", glb_obs_repartition_train_condition, ")"))))
glb_trnobs_df$.src <- "Train"
# ... and rows satisfying its negation become the new (test) observations.
# NOTE(review): subset() treats NA as FALSE, so a row for which the condition
# evaluates to NA lands in neither partition and is lost from glb_allobs_df -
# confirm the condition can never be NA.
glb_newobs_df <- do.call("subset", list(glb_allobs_df,
parse(text=paste0("!(", glb_obs_repartition_train_condition, ")"))))
glb_newobs_df$.src <- "Test"
# Rebuild the combined data, training rows first; glb_trnobs_df and
# glb_newobs_df are left populated afterwards
glb_allobs_df <- rbind(glb_trnobs_df, glb_newobs_df)
dsp_partition_stats(obs_df = glb_allobs_df,
vars = myget_symbols(glb_obs_repartition_train_condition))
}
# Register the start of the "inspect.data" chunk in the chunk log
# (presumably for timing, given the bgn/end/elapsed columns it prints)
glb_chunks_df <- myadd_chunk(glb_chunks_df, "inspect.data", major.inc=TRUE)
## label step_major step_minor label_minor bgn end elapsed
## 1 import.data 1 0 0 9.734 20.675 10.942
## 2 inspect.data 2 0 0 20.676 NA NA
2.0: inspect data
#print(str(glb_allobs_df))
#View(glb_allobs_df)
dsp_class_dstrb <- function(var) {
    # Prints, for each .src partition of glb_allobs_df, the cross-tabulated
    # counts for `var`, followed by the same table with every row divided by
    # its (NA-ignoring) row total.
    #
    # var: name of the column to tabulate against .src
    counts_df <- mycreate_xtab_df(glb_allobs_df, c(".src", var))

    # Move the partition label from a column into the row names
    rownames(counts_df) <- counts_df$.src
    counts_df <- counts_df[, names(counts_df) != ".src", drop = FALSE]

    print(counts_df)
    print(counts_df / rowSums(counts_df, na.rm=TRUE))
}
# Performed repeatedly in other chunks
glb_chk_data <- function(featsExclude = glbFeatsExclude,
                         fctrMaxUniqVals = glbFctrMaxUniqVals) {
    # Runs the recurring data checks on glb_allobs_df:
    #   1. histogram of the raw response, one panel per .src partition
    #   2. for classification, the class distribution per partition
    #   3. mycheck_problem_data(), whose value is returned
    #
    # featsExclude, fctrMaxUniqVals: passed through to mycheck_problem_data()
    rsp_hist <- myplot_histogram(glb_allobs_df, glb_rsp_var_raw) + facet_wrap(~ .src)
    print(rsp_hist)

    if (glb_is_classification) {
        # Use the mapped response when it already exists, else the raw one
        rsp_col <- ifelse(glb_rsp_var %in% names(glb_allobs_df),
                          glb_rsp_var, glb_rsp_var_raw)
        dsp_class_dstrb(var = rsp_col)
    }

    mycheck_problem_data(glb_allobs_df, featsExclude, fctrMaxUniqVals)
}
# Run the data checks with the default exclusions (glbFeatsExclude) and the
# default factor-cardinality limit (glbFctrMaxUniqVals)
glb_chk_data()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 422 rows containing non-finite values (stat_bin).
## Loading required package: reshape2
## sold.0 sold.1 sold.NA
## Test NA NA 422
## Train 798 217 NA
## sold.0 sold.1 sold.NA
## Test NA NA 1
## Train 0.7862069 0.2137931 NA
## [1] "numeric data missing in : "
## sold
## 422
## [1] "numeric data w/ 0s in : "
## biddable sold
## 1437 798
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## description condition cellular carrier color storage
## 760 0 0 0 0 0
## productline .grpid
## 0 NA
# Create new features that help diagnostics
# Derive the modelling response from the raw response when a mapping
# function has been configured
if (!is.null(glb_map_rsp_raw_to_var)) {
    glb_allobs_df[, glb_rsp_var] <- glb_map_rsp_raw_to_var(
        glb_allobs_df[, glb_rsp_var_raw]
    )

    # Display how the raw values were mapped
    mycheck_map_results(mapd_df = glb_allobs_df,
                        from_col_name = glb_rsp_var_raw,
                        to_col_name = glb_rsp_var)

    if (glb_is_classification) {
        dsp_class_dstrb(glb_rsp_var)
    }
}
## sold sold.fctr .n
## 1 0 N 798
## 2 NA <NA> 422
## 3 1 Y 217
## Warning: Removed 1 rows containing missing values (position_stack).
## sold.fctr.N sold.fctr.Y sold.fctr.NA
## Test NA NA 422
## Train 798 217 NA
## sold.fctr.N sold.fctr.Y sold.fctr.NA
## Test NA NA 1
## Train 0.7862069 0.2137931 NA
# check distribution of all numeric data
dsp_numeric_feats_dstrb <- function(feats_vctr) {
    # Draws one plot per feature of glb_allobs_df on a fresh grid page laid
    # out in two columns, filling the grid row by row.
    #
    # feats_vctr: names of the columns to plot
    #
    # Regression: scatter plot of the response against the feature.
    # Classification: violin plot of the feature by response value.
    # Factor features are additionally facetted by their own levels.
    n_grid_rows <- ceiling(length(feats_vctr) / 2.0)
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(n_grid_rows, 2)))

    cell_ix <- 0
    for (feat in feats_vctr) {
        cell_ix <- cell_ix + 1
        #print(sprintf("feat: %s", feat))

        if (glb_is_regression)
            gp <- myplot_scatter(df=glb_allobs_df, ycol_name=glb_rsp_var, xcol_name=feat,
                                 smooth=TRUE)
        if (glb_is_classification)
            #gp <- myplot_box(df=glb_allobs_df, ycol_names=feat, xcol_name=glb_rsp_var)
            gp <- myplot_violin(glb_allobs_df, ycol_names = feat, xcol_name = glb_rsp_var)
        if (inherits(glb_allobs_df[, feat], "factor"))
            gp <- gp + facet_wrap(reformulate(feat))

        # Odd positions go to the left column, even ones to the right
        cell_vp <- viewport(layout.pos.row = ceiling(cell_ix / 2.0),
                            layout.pos.col = ((cell_ix - 1) %% 2) + 1)
        print(gp + labs(title = feat), vp = cell_vp)
    }
}
# dsp_numeric_feats_dstrb(setdiff(names(glb_allobs_df), union(myfind_chr_cols_df(glb_allobs_df), c(glb_rsp_var_raw, glb_rsp_var)))[2:6]) # dsp_numeric_feats_dstrb(c("startprice", "sprice.root3", "sprice.predict.diff"))
# Adds diagnostic feature columns to obs_df and returns the result.
# As written, the only column added is .rnorm, a vector of standard-normal
# random draws (one per row); the commented lines inside mutate() are
# templates for further derived columns.
#   obs_df: data frame to extend
#   ref_df: not referenced anywhere in the body
add_new_diag_feats <- function(obs_df, ref_df=glb_allobs_df) {
# NOTE(review): mutate() is called unqualified, so whichever attached package
# defines it at call time is used; require(plyr) only guarantees that plyr is
# attached - confirm that is the intended implementation.
require(plyr)
# Fixed seed so that .rnorm is identical on every run
set.seed(169)
obs_df <- mutate(obs_df,
# <col_name>.NA=is.na(<col_name>),
# <col_name>.fctr=factor(<col_name>,
# as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
# <col_name>.fctr=relevel(factor(<col_name>,
# as.factor(union(obs_df$<col_name>, obs_twin_df$<col_name>))),
# "<ref_val>"),
# <col2_name>.fctr=relevel(factor(ifelse(<col1_name> == <val>, "<oth_val>", "<ref_val>")),
# as.factor(c("R", "<ref_val>")),
# ref="<ref_val>"),
# This doesn't work - use sapply instead
# <col_name>.fctr_num=grep(<col_name>, levels(<col_name>.fctr)),
#
# Date.my=as.Date(strptime(Date, "%m/%d/%y %H:%M")),
# Year=year(Date.my),
# Month=months(Date.my),
# Weekday=weekdays(Date.my)
# <col_name>=<table>[as.character(<col2_name>)],
# <col_name>=as.numeric(<col2_name>),
# <col_name> = trunc(<col2_name> / 100),
.rnorm = rnorm(n=nrow(obs_df))
)
# If levels of a factor are different across obs_df & glb_newobs_df; predict.glm fails
# Transformations not handled by mutate
# obs_df$<col_name>.fctr.num <- sapply(1:nrow(obs_df),
# function(row_ix) grep(obs_df[row_ix, "<col_name>"],
# levels(obs_df[row_ix, "<col_name>.fctr"])))
#print(summary(obs_df))
#print(sapply(names(obs_df), function(col) sum(is.na(obs_df[, col]))))
return(obs_df)
}
glb_allobs_df <- add_new_diag_feats(glb_allobs_df)
## Loading required package: plyr
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following objects are masked from 'package:gdata':
##
## combine, first, last
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#stop(here"); sav_allobs_df <- glb_allobs_df # glb_allobs_df <- sav_allobs_df
# Merge some <descriptor>
# glb_allobs_df$<descriptor>.my <- glb_allobs_df$<descriptor>
# glb_allobs_df[grepl("\\bAIRPORT\\b", glb_allobs_df$<descriptor>.my),
# "<descriptor>.my"] <- "AIRPORT"
# Check distributions of newly transformed / extracted vars
# Enhancement: remove vars that were displayed earlier
dsp_numeric_feats_dstrb(feats_vctr=setdiff(names(glb_allobs_df),
c(myfind_chr_cols_df(glb_allobs_df), glb_rsp_var_raw, glb_rsp_var,
glbFeatsExclude)))
# Convert factors to dummy variables
# Build splines require(splines); bsBasis <- bs(training$age, df=3)
#pairs(subset(glb_trnobs_df, select=-c(col_symbol)))
# Check for glb_newobs_df & glb_trnobs_df features range mismatches
# Other diagnostics:
# print(subset(glb_trnobs_df, <col1_name> == max(glb_trnobs_df$<col1_name>, na.rm=TRUE) &
# <col2_name> <= mean(glb_trnobs_df$<col1_name>, na.rm=TRUE)))
# print(glb_trnobs_df[which.max(glb_trnobs_df$<col_name>),])
# print(<col_name>_freq_glb_trnobs_df <- mycreate_tbl_df(glb_trnobs_df, "<col_name>"))
# print(which.min(table(glb_trnobs_df$<col_name>)))
# print(which.max(table(glb_trnobs_df$<col_name>)))
# print(which.max(table(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>)[, 2]))
# print(table(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>))
# print(table(is.na(glb_trnobs_df$<col1_name>), glb_trnobs_df$<col2_name>))
# print(table(sign(glb_trnobs_df$<col1_name>), glb_trnobs_df$<col2_name>))
# print(mycreate_xtab_df(glb_trnobs_df, <col1_name>))
# print(mycreate_xtab_df(glb_trnobs_df, c(<col1_name>, <col2_name>)))
# print(<col1_name>_<col2_name>_xtab_glb_trnobs_df <-
# mycreate_xtab_df(glb_trnobs_df, c("<col1_name>", "<col2_name>")))
# <col1_name>_<col2_name>_xtab_glb_trnobs_df[is.na(<col1_name>_<col2_name>_xtab_glb_trnobs_df)] <- 0
# print(<col1_name>_<col2_name>_xtab_glb_trnobs_df <-
# mutate(<col1_name>_<col2_name>_xtab_glb_trnobs_df,
# <col3_name>=(<col1_name> * 1.0) / (<col1_name> + <col2_name>)))
# print(mycreate_sqlxtab_df(glb_allobs_df, c("<col1_name>", "<col2_name>")))
# print(<col2_name>_min_entity_arr <-
# sort(tapply(glb_trnobs_df$<col1_name>, glb_trnobs_df$<col2_name>, min, na.rm=TRUE)))
# print(<col1_name>_na_by_<col2_name>_arr <-
# sort(tapply(glb_trnobs_df$<col1_name>.NA, glb_trnobs_df$<col2_name>, mean, na.rm=TRUE)))
# Other plots:
# print(myplot_box(df=glb_trnobs_df, ycol_names="<col1_name>"))
# print(myplot_box(df=glb_trnobs_df, ycol_names="<col1_name>", xcol_name="<col2_name>"))
# print(myplot_line(subset(glb_trnobs_df, Symbol %in% c("CocaCola", "ProcterGamble")),
# "Date.POSIX", "StockPrice", facet_row_colnames="Symbol") +
# geom_vline(xintercept=as.numeric(as.POSIXlt("2003-03-01"))) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1983-01-01")))
# )
# print(myplot_line(subset(glb_trnobs_df, Date.POSIX > as.POSIXct("2004-01-01")),
# "Date.POSIX", "StockPrice") +
# geom_line(aes(color=Symbol)) +
# coord_cartesian(xlim=c(as.POSIXct("1990-01-01"),
# as.POSIXct("2000-01-01"))) +
# coord_cartesian(ylim=c(0, 250)) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1997-09-01"))) +
# geom_vline(xintercept=as.numeric(as.POSIXlt("1997-11-01")))
# )
# print(myplot_scatter(glb_allobs_df, "<col1_name>", "<col2_name>", smooth=TRUE))
# print(myplot_scatter(glb_allobs_df, "<col1_name>", "<col2_name>", colorcol_name="<Pred.fctr>") +
# geom_point(data=subset(glb_allobs_df, <condition>),
# mapping=aes(x=<x_var>, y=<y_var>), color="red", shape=4, size=5) +
# geom_vline(xintercept=84))
glb_chunks_df <- myadd_chunk(glb_chunks_df, "scrub.data", major.inc=FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 2 inspect.data 2 0 0 20.676 25.487 4.811
## 3 scrub.data 2 1 1 25.488 NA NA
2.1: scrub data
mycheck_problem_data(glb_allobs_df, featsExclude = glbFeatsExclude,
fctrMaxUniqVals = glbFctrMaxUniqVals)
## [1] "numeric data missing in : "
## sold sold.fctr
## 422 422
## [1] "numeric data w/ 0s in : "
## biddable sold
## 1437 798
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## description condition cellular carrier color storage
## 760 0 0 0 0 0
## productline .grpid
## 0 NA
# Debugging aid for locating a problematic character in a string.
#
# Prints `x`, then extracts characters 1..maxStringLength one at a time with
# substr(). The extracted character is not inspected here; the walk is only
# useful if substr() itself fails on the string (presumably on an invalid
# multibyte sequence - verify in the target locale).
#
# Args:
#   x: string to scan.
#   maxStringLength: number of character positions to visit (default 256).
# Returns:
#   NULL, invisibly (the value of the for loop).
findOffendingCharacter <- function(x, maxStringLength=256){
    print(x)
    for (char_pos in 1:maxStringLength) {
        # uncomment the print below if you want the individual characters printed;
        # the character after the last one printed is the offending one
        offendingChar <- substr(x, start = char_pos, stop = char_pos)
        #print(offendingChar)
    }
}
# string_vector <- c("test", "Se\x96ora", "works fine")
# lapply(string_vector, findOffendingCharacter)
# lapply(glb_allobs_df$description[29], findOffendingCharacter)
# Prints a cross-tab of Headline.pfx x Headline x response for the observations
# selected by sel_obs(Headline.contains=str).
# NOTE(review): relies on "Headline" columns, which look like leftovers from a
# different dataset - confirm they exist in glb_allobs_df before using.
dsp_hdlxtab <- function(str) {
    sel_df <- glb_allobs_df[sel_obs(Headline.contains=str), ]
    xtab_df <- mycreate_sqlxtab_df(sel_df,
                                   c("Headline.pfx", "Headline", glb_rsp_var))
    print(xtab_df)
}
#dsp_hdlxtab("(1914)|(1939)")
# Prints a cross-tab of Headline.pfx x NewsDesk x SectionName x SubsectionName
# x response for the observations selected by sel_obs(Headline.contains=str).
# NOTE(review): the column names look like leftovers from a different dataset -
# confirm they exist in glb_allobs_df before using.
dsp_catxtab <- function(str) {
    sel_df <- glb_allobs_df[sel_obs(Headline.contains=str), ]
    xtab_cols <- c("Headline.pfx", "NewsDesk", "SectionName", "SubsectionName",
                   glb_rsp_var)
    print(mycreate_sqlxtab_df(sel_df, xtab_cols))
}
# dsp_catxtab("1914)|(1939)")
# dsp_catxtab("19(14|39|64):")
# dsp_catxtab("19..:")
# Merge some categories
# glb_allobs_df$myCategory <-
# plyr::revalue(glb_allobs_df$myCategory, c(
# "#Business Day#Dealbook" = "Business#Business Day#Dealbook",
# "#Business Day#Small Business" = "Business#Business Day#Small Business",
# "dummy" = "dummy"
# ))
# ctgry_xtab_df <- orderBy(reformulate(c("-", ".n")),
# mycreate_sqlxtab_df(glb_allobs_df,
# c("myCategory", "NewsDesk", "SectionName", "SubsectionName", glb_rsp_var)))
# myprint_df(ctgry_xtab_df)
# write.table(ctgry_xtab_df, paste0(glb_out_pfx, "ctgry_xtab.csv"),
# row.names=FALSE)
# ctgry_cast_df <- orderBy(~ -Y -NA, dcast(ctgry_xtab_df,
# myCategory + NewsDesk + SectionName + SubsectionName ~
# Popular.fctr, sum, value.var=".n"))
# myprint_df(ctgry_cast_df)
# write.table(ctgry_cast_df, paste0(glb_out_pfx, "ctgry_cast.csv"),
# row.names=FALSE)
# print(ctgry_sum_tbl <- table(glb_allobs_df$myCategory, glb_allobs_df[, glb_rsp_var],
# useNA="ifany"))
# Chi-squared test of association between "observation is selected by
# sel_obs(...)" and the `Popular` column, over observations where `Popular`
# is not NA. Prints the 2-way table (rows: selected 0/1; cols: Popular) and
# the chisq.test() result.
#
# Args:
#   ...: selection criteria forwarded unchanged to sel_obs().
# Returns:
#   The chisq.test() result, invisibly (value of the final print()).
# NOTE(review): the response column is hard-coded as "Popular" rather than
# taken from glb_rsp_var / glb_rsp_var_raw - confirm before use on this data.
dsp_chisq.test <- function(...) {
    sel_mask <- sel_obs(...)
    rsp_known <- !is.na(glb_allobs_df$Popular)

    # Selected observations carry a marker of 1 ...
    sel_df <- glb_allobs_df[sel_mask & rsp_known, ]
    sel_df$.marker <- 1

    # ... and every observation with a known response is the reference set
    ref_df <- glb_allobs_df[rsp_known, ]
    mrg_df <- merge(ref_df[, c(glb_id_var, "Popular")],
                    sel_df[, c(glb_id_var, ".marker")],
                    all.x=TRUE)
    # Unselected reference rows get NA markers from the left join; make them 0
    mrg_df[is.na(mrg_df)] <- 0

    mrg_tbl <- table(mrg_df$.marker, mrg_df$Popular)
    print(mrg_tbl)
    print("Rows:Selected; Cols:Popular")
    #print(mrg_tbl)
    print(chisq.test(mrg_tbl))
}
# dsp_chisq.test(Headline.contains="[Ee]bola")
# dsp_chisq.test(Snippet.contains="[Ee]bola")
# dsp_chisq.test(Abstract.contains="[Ee]bola")
# print(mycreate_sqlxtab_df(glb_allobs_df[sel_obs(Headline.contains="[Ee]bola"), ],
# c(glb_rsp_var, "NewsDesk", "SectionName", "SubsectionName")))
# print(table(glb_allobs_df$NewsDesk, glb_allobs_df$SectionName))
# print(table(glb_allobs_df$SectionName, glb_allobs_df$SubsectionName))
# print(table(glb_allobs_df$NewsDesk, glb_allobs_df$SectionName, glb_allobs_df$SubsectionName))
# glb_allobs_df$myCategory.fctr <- as.factor(glb_allobs_df$myCategory)
print(table(glb_allobs_df$cellular, glb_allobs_df$carrier, useNA="ifany"))
##
## AT&T None Sprint T-Mobile Unknown Verizon
## 0 0 798 0 0 0 0
## 1 172 0 25 17 98 120
## Unknown 3 3 0 0 201 0
# glb_allobs_df[(glb_allobs_df$cellular %in% c("Unknown")) &
# (glb_allobs_df$carrier %in% c("AT&T", "Other")),
# c(glb_id_var, glb_rsp_var_raw, "description", "carrier", "cellular")]
# Reconcile `cellular` with `carrier` where cellular is recorded as "Unknown":
# a named carrier implies a cellular device ...
glb_allobs_df$cellular[with(glb_allobs_df,
                            (cellular %in% "Unknown") &
                            (carrier %in% c("AT&T", "Other")))] <- "1"
# To inspect the rows affected by the next assignment:
# glb_allobs_df[(glb_allobs_df$cellular %in% c("Unknown")) &
#               (glb_allobs_df$carrier %in% c("None")),
#                 c(glb_id_var, glb_rsp_var_raw, "description", "carrier", "cellular")]
# ... and carrier "None" implies a non-cellular device
glb_allobs_df$cellular[with(glb_allobs_df,
                            (cellular %in% "Unknown") &
                            (carrier %in% "None"))] <- "0"
print(table(glb_allobs_df$cellular, glb_allobs_df$carrier, useNA="ifany"))
##
## AT&T None Sprint T-Mobile Unknown Verizon
## 0 0 801 0 0 0 0
## 1 175 0 25 17 98 120
## Unknown 0 0 0 0 201 0
2.1: scrub data
glb_chunks_df <- myadd_chunk(glb_chunks_df, "transform.data", major.inc=FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 3 scrub.data 2 1 1 25.488 26.509 1.021
## 4 transform.data 2 2 2 26.509 NA NA
### Mapping dictionary
#sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Mapping dictionary: for every feature named in glb_map_vars, import the
# mapping table from glb_map_urls[[feat]] and hand it to mymap_codes(), using
# the table's first column as the join column and its second column as the
# mapped (target) column. Skipped entirely when glb_map_vars is NULL.
if (!is.null(glb_map_vars)) {
for (feat in glb_map_vars) {
map_df <- myimport_data(url=glb_map_urls[[feat]],
comment="map_df",
print_diagn=TRUE)
glb_allobs_df <- mymap_codes(glb_allobs_df, feat, names(map_df)[2],
map_df, map_join_col_name=names(map_df)[1],
map_tgt_col_name=names(map_df)[2])
}
# The raw (unmapped) features are added to the exclusion list
glbFeatsExclude <- union(glbFeatsExclude, glb_map_vars)
}
### Forced Assignments
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Forced assignments: for every feature in glb_assign_vars create a copy named
# "<feat>.my" and rewrite selected values in the copy according to
# glb_assign_pairs_lst[[feat]], a list with parallel vectors `from` and `to`.
# A `from` of NA selects the missing values of the feature. The number of
# observations rewritten by each pair is printed.
for (feat in glb_assign_vars) {
    new_feat <- paste0(feat, ".my")
    print(sprintf("Forced Assignments for: %s -> %s...", feat, new_feat))
    glb_allobs_df[, new_feat] <- glb_allobs_df[, feat]

    pairs <- glb_assign_pairs_lst[[feat]]
    # seq_along: an empty / missing `from` vector must not run the loop at all
    for (pair_ix in seq_along(pairs$from)) {
        from_val <- pairs$from[pair_ix]
        to_val <- pairs$to[pair_ix]

        # Validate the pair before touching any data
        if (is.na(from_val) && is.na(to_val))
            stop("what are you trying to do ???")

        # Selection is based on the original feature, not the ".my" copy.
        # NAs in the feature never match a non-NA `from`; keeping them out of
        # the mask avoids an NA count and NA subscripts in the assignment.
        feat_vals <- glb_allobs_df[, feat]
        match_rows <- if (is.na(from_val)) is.na(feat_vals) else
            (!is.na(feat_vals) & (feat_vals == from_val))
        nobs <- sum(match_rows)
        #nobs <- nrow(filter(glb_allobs_df, is.na(Married.fctr)))    ; print(nobs)

        glb_allobs_df[match_rows, new_feat] <- to_val

        print(sprintf("    %s -> %s for %s obs",
                      from_val, to_val, format(nobs, big.mark=",")))
    }
}
# The original features are superseded by their ".my" copies; done once, and
# only when there was something to assign (as in the per-iteration original)
if (length(glb_assign_vars) > 0)
    glbFeatsExclude <- union(glbFeatsExclude, glb_assign_vars)
### Derivations using mapping functions
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# Derivations using mapping functions: for every name in glb_derive_vars, call
# glbFeatsDerive[[new_feat]]$mapfn with the columns listed in
# glbFeatsDerive[[new_feat]]$args (passed as arguments named after the
# columns) and store the result as column `new_feat`. Features are derived in
# order, so a later derivation may use an earlier one as an argument.
for (new_feat in glb_derive_vars) {
    print(sprintf("Creating new feature: %s...", new_feat))
    # Must start as a list: `[[<-` on NULL with a length-1 value produces an
    # atomic vector, which do.call() rejects (single-row data)
    args_lst <- list()
    for (arg in glbFeatsDerive[[new_feat]]$args)
        args_lst[[arg]] <- glb_allobs_df[, arg]
    glb_allobs_df[, new_feat] <- do.call(glbFeatsDerive[[new_feat]]$mapfn, args_lst)
}
## [1] "Creating new feature: sprice.root2..."
## [1] "Creating new feature: sprice.log10..."
## [1] "Creating new feature: sprice.d20nexp..."
## [1] "Creating new feature: sprice.predict.diff..."
## [1] "Creating new feature: spdiff.cut.fctr..."
## [1] "Creating new feature: descr.my..."
## [1] "Creating new feature: prdl.descr.my.fctr..."
#stop(here")
#hex_vctr <- c("\n", "\211", "\235", "\317", "\333")
# Alternation regex of the control / high-byte characters (newline and octal
# escapes \211, \235, \317, \333) that were found in some descriptions.
hex_regex <- paste0(c("\n", "\211", "\235", "\317", "\333"), collapse="|")
# Walks a hand-picked list of UniqueIDs and locates each row. The actual
# scrubbing of "descr.my" is commented out, so this loop currently has no
# effect beyond leaving `obs_id` / `row_pos` set to the last id's values.
for (obs_id in c(10029, 10948, 10136, 10178, 11514, 11904, 12157, 12210, 12659)) {
# tmp_str <- unlist(strsplit(glb_allobs_df[row_pos, "descr.my"], ""))
# glb_allobs_df[row_pos, "descr.my"] <- paste0(tmp_str[!tmp_str %in% hex_vctr],
# collapse="")
row_pos <- which(glb_allobs_df$UniqueID == obs_id)
# glb_allobs_df[row_pos, "descr.my"] <-
# gsub(hex_regex, " ", glb_allobs_df[row_pos, "descr.my"])
}
2.2: transform data
#```{r extract_features, cache=FALSE, eval=!is.null(glbFeatsText)}
glb_chunks_df <- myadd_chunk(glb_chunks_df, "extract.features", major.inc=TRUE)
## label step_major step_minor label_minor bgn end elapsed
## 4 transform.data 2 2 2 26.509 26.903 0.394
## 5 extract.features 3 0 0 26.903 NA NA
extract.features_chunk_df <- myadd_chunk(NULL, "extract.features_bgn")
## label step_major step_minor label_minor bgn end
## 1 extract.features_bgn 1 0 0 26.911 NA
## elapsed
## 1 NA
# Create new features that help prediction
# <col_name>.lag.2 <- lag(zoo(glb_trnobs_df$<col_name>), -2, na.pad=TRUE)
# glb_trnobs_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
# <col_name>.lag.2 <- lag(zoo(glb_newobs_df$<col_name>), -2, na.pad=TRUE)
# glb_newobs_df[, "<col_name>.lag.2"] <- coredata(<col_name>.lag.2)
#
# glb_newobs_df[1, "<col_name>.lag.2"] <- glb_trnobs_df[nrow(glb_trnobs_df) - 1,
# "<col_name>"]
# glb_newobs_df[2, "<col_name>.lag.2"] <- glb_trnobs_df[nrow(glb_trnobs_df),
# "<col_name>"]
# glb_allobs_df <- mutate(glb_allobs_df,
# A.P.http=ifelse(grepl("http",Added,fixed=TRUE), 1, 0)
# )
#
# glb_trnobs_df <- mutate(glb_trnobs_df,
# )
#
# glb_newobs_df <- mutate(glb_newobs_df,
# )
# Convert dates to numbers
# typically, dates come in as chars;
# so this must be done before converting chars to factors
#stop(here"); sav_allobs_df <- glb_allobs_df #; glb_allobs_df <- sav_allobs_df
# Date feature extraction; runs only when glb_date_vars is set.
# Steps, per the code below:
#   1. append the columns returned by myextract_dates_df() to glb_allobs_df;
#   2. exclude the raw date columns and their ".POSIX" versions from modelling;
#   3. for each date feature: sort by its ".POSIX" column, plot, and derive
#      "<feat>.last{1,2,10,100}.log" = log(1 + gap to the timestamp 1 / 2 / 10 /
#      100 observations earlier), with gaps that cannot be computed set to 0;
#   4. restore the glb_id_var ordering and exclude the helper ".zoo" column.
if (!is.null(glb_date_vars)) {
glb_allobs_df <- cbind(glb_allobs_df,
myextract_dates_df(df=glb_allobs_df, vars=glb_date_vars,
id_vars=glb_id_var, rsp_var=glb_rsp_var))
for (sfx in c("", ".POSIX"))
glbFeatsExclude <-
union(glbFeatsExclude,
paste(glb_date_vars, sfx, sep=""))
for (feat in glb_date_vars) {
# Sort chronologically so that lags refer to the previous timestamps
glb_allobs_df <- orderBy(reformulate(paste0(feat, ".POSIX")), glb_allobs_df)
# print(myplot_scatter(glb_allobs_df, xcol_name=paste0(feat, ".POSIX"),
# ycol_name=glb_rsp_var, colorcol_name=glb_rsp_var))
# NOTE(review): the cut-off dates in the two plots below are hard-coded
print(myplot_scatter(glb_allobs_df[glb_allobs_df[, paste0(feat, ".POSIX")] >=
strptime("2012-12-01", "%Y-%m-%d"), ],
xcol_name=paste0(feat, ".POSIX"),
ycol_name=glb_rsp_var, colorcol_name=paste0(feat, ".wkend")))
# Create features that measure the gap between previous timestamp in the data
require(zoo)
# Timestamps as numbers, wrapped in a zoo series indexed 1..n
z <- zoo(as.numeric(as.POSIXlt(glb_allobs_df[, paste0(feat, ".POSIX")])))
glb_allobs_df[, paste0(feat, ".zoo")] <- z
print(head(glb_allobs_df[, c(glb_id_var, feat, paste0(feat, ".zoo"))]))
print(myplot_scatter(glb_allobs_df[glb_allobs_df[, paste0(feat, ".POSIX")] >
strptime("2012-10-01", "%Y-%m-%d"), ],
xcol_name=paste0(feat, ".zoo"), ycol_name=glb_rsp_var,
colorcol_name=glb_rsp_var))
# Empty zoo series over every row index; merging with it pads the lagged
# differences back to one value per row (NA where no earlier row exists)
b <- zoo(, seq(nrow(glb_allobs_df)))
last1 <- as.numeric(merge(z-lag(z, -1), b, all=TRUE)); last1[is.na(last1)] <- 0
glb_allobs_df[, paste0(feat, ".last1.log")] <- log(1 + last1)
print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
paste0(feat, ".last1.log")] > 0, ],
ycol_names=paste0(feat, ".last1.log"),
xcol_name=glb_rsp_var))
last2 <- as.numeric(merge(z-lag(z, -2), b, all=TRUE)); last2[is.na(last2)] <- 0
glb_allobs_df[, paste0(feat, ".last2.log")] <- log(1 + last2)
print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
paste0(feat, ".last2.log")] > 0, ],
ycol_names=paste0(feat, ".last2.log"),
xcol_name=glb_rsp_var))
last10 <- as.numeric(merge(z-lag(z, -10), b, all=TRUE)); last10[is.na(last10)] <- 0
glb_allobs_df[, paste0(feat, ".last10.log")] <- log(1 + last10)
print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
paste0(feat, ".last10.log")] > 0, ],
ycol_names=paste0(feat, ".last10.log"),
xcol_name=glb_rsp_var))
last100 <- as.numeric(merge(z-lag(z, -100), b, all=TRUE)); last100[is.na(last100)] <- 0
glb_allobs_df[, paste0(feat, ".last100.log")] <- log(1 + last100)
print(gp <- myplot_box(df=glb_allobs_df[glb_allobs_df[,
paste0(feat, ".last100.log")] > 0, ],
ycol_names=paste0(feat, ".last100.log"),
xcol_name=glb_rsp_var))
# Restore the id ordering that the rest of the script works with
glb_allobs_df <- orderBy(reformulate(glb_id_var), glb_allobs_df)
glbFeatsExclude <- union(glbFeatsExclude,
c(paste0(feat, ".zoo")))
# Reference snippet (not run): further lags and NA imputation by group means
# all2$last3 = as.numeric(merge(z-lag(z, -3), b, all = TRUE))
# all2$last5 = as.numeric(merge(z-lag(z, -5), b, all = TRUE))
# all2$last10 = as.numeric(merge(z-lag(z, -10), b, all = TRUE))
# all2$last20 = as.numeric(merge(z-lag(z, -20), b, all = TRUE))
# all2$last50 = as.numeric(merge(z-lag(z, -50), b, all = TRUE))
#
#
# # order table
# all2 = all2[order(all2$id),]
#
# ## fill in NAs
# # count averages
# na.avg = all2 %>% group_by(weekend, hour) %>% dplyr::summarise(
# last1=mean(last1, na.rm=TRUE),
# last3=mean(last3, na.rm=TRUE),
# last5=mean(last5, na.rm=TRUE),
# last10=mean(last10, na.rm=TRUE),
# last20=mean(last20, na.rm=TRUE),
# last50=mean(last50, na.rm=TRUE)
# )
#
# # fill in averages
# na.merge = merge(all2, na.avg, by=c("weekend","hour"))
# na.merge = na.merge[order(na.merge$id),]
# for(i in c("last1", "last3", "last5", "last10", "last20", "last50")) {
# y = paste0(i, ".y")
# idx = is.na(all2[[i]])
# all2[idx,][[i]] <- na.merge[idx,][[y]]
# }
# rm(na.avg, na.merge, b, i, idx, n, pd, sec, sh, y, z)
}
}
# Drop the temporary lag vectors created by the date-feature block. They only
# exist when glb_date_vars was set, so remove just the ones that are present;
# an unconditional rm() emits an "object not found" warning for each missing
# name. `last2` is created alongside the others and is cleaned up as well.
rm(list = intersect(c("last1", "last2", "last10", "last100"), ls()))
## Warning in rm(last1, last10, last100): object 'last1' not found
## Warning in rm(last1, last10, last100): object 'last10' not found
## Warning in rm(last1, last10, last100): object 'last100' not found
# Create factors of string variables
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "factorize.str.vars"), major.inc=TRUE)
## label step_major step_minor label_minor
## 1 extract.features_bgn 1 0 0
## 2 extract.features_factorize.str.vars 2 0 0
## bgn end elapsed
## 1 26.911 26.926 0.016
## 2 26.927 NA NA
#stop(here"); sav_allobs_df <- glb_allobs_df; #glb_allobs_df <- sav_allobs_df
print(str_vars <- myfind_chr_cols_df(glb_allobs_df))
## description condition cellular carrier color
## "description" "condition" "cellular" "carrier" "color"
## storage productline .src .grpid descr.my
## "storage" "productline" ".src" ".grpid" "descr.my"
# Create "<var>.fctr" factor versions of the remaining string variables
# (those not excluded and not treated as text), using the most frequent value
# as the reference (first) level, then exclude the raw string columns.
if (length(str_vars <- setdiff(str_vars,
                               c(glbFeatsExclude, glbFeatsText))) > 0) {
    for (var in str_vars) {
        # warning() is used (rather than print) so the message shows up in the
        # knitted report's warnings
        warning("Creating factors of string variable: ", var,
                ": # of unique values: ", length(unique(glb_allobs_df[, var])))
        var_fctr <- factor(glb_allobs_df[, var])
        # Frequencies of the actual levels only: NA is never a level of
        # factor(), so it must not be eligible as the reference level
        # (relevel() stops with "'ref' must be an existing level")
        var_freq_tbl <- table(var_fctr)
        # An all-NA column has no levels and therefore nothing to relevel
        if (length(var_freq_tbl) > 0)
            var_fctr <- relevel(var_fctr, names(which.max(var_freq_tbl)))
        glb_allobs_df[, paste0(var, ".fctr")] <- var_fctr
    }
    glbFeatsExclude <- union(glbFeatsExclude, str_vars)
}
## Warning: Creating factors of string variable: condition: # of unique
## values: 6
## Warning: Creating factors of string variable: cellular: # of unique values:
## 3
## Warning: Creating factors of string variable: carrier: # of unique values:
## 6
## Warning: Creating factors of string variable: color: # of unique values: 5
## Warning: Creating factors of string variable: storage: # of unique values:
## 5
if (!is.null(glbFeatsText)) {
require(foreach)
require(gsubfn)
require(stringr)
require(tm)
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "process.text"), major.inc=TRUE)
# Frequency table of every substring of the global `txt_vctr` that matches
# `rex_str`.
#
# Args:
#   rex_str: regular expression, interpreted by stringr::regex().
#   ignore.case: match case-insensitively? (default TRUE)
# Returns:
#   mycreate_sqlxtab_df() applied to a one-column ("pattern") data frame that
#   holds one row per match.
chk_pattern_freq <- function(rex_str, ignore.case=TRUE) {
    rex_obj <- regex(rex_str, ignore_case=ignore.case)
    # simplify=TRUE gives a matrix padded with "" where a string has fewer matches
    match_mtrx <- str_extract_all(txt_vctr, rex_obj, simplify=TRUE)
    found_vctr <- match_mtrx[match_mtrx != ""]
    match_df <- as.data.frame(found_vctr)
    names(match_df) <- "pattern"
    mycreate_sqlxtab_df(match_df, "pattern")
}
# match_lst <- gregexpr("\\bok(?!ay)", txt_vctr[746], ignore.case = FALSE, perl=TRUE); print(match_lst)
# Lists, per element of the global `txt_vctr`, all matches of `rex_str`
# (Perl-compatible regex) joined with "#".
#
# Args:
#   rex_str: regular expression (perl=TRUE).
#   ignore.case: match case-insensitively? (default TRUE)
#   print.all: print the resulting data frame? (default TRUE)
# Returns:
#   Data frame with a single column `matches`, restricted to the elements that
#   had at least one match. Row names identify the matching elements (their
#   names if `txt_vctr` is named with unique names, else their positions).
dsp_pattern <- function(rex_str, ignore.case=TRUE, print.all=TRUE) {
    hits_lst <- regmatches(txt_vctr,
                           gregexpr(rex_str, txt_vctr,
                                    ignore.case = ignore.case, perl=TRUE))
    joined_vctr <- sapply(hits_lst,
                          function (elems) paste(elems, collapse="#"))
    match_df <- subset(data.frame(matches=joined_vctr), matches != "")
    if (print.all) {
        print(match_df)
    }
    return(match_df)
}
# Prints the match positions of `rex_str` (perl=TRUE) in element `ix` of the
# global `txt_vctr`, followed by a slice of that element around each match.
#
# Args:
#   rex_str: regular expression.
#   ix: index into txt_vctr.
# NOTE(review): the slice runs from 0.99 * position to position + 100; this
# may have been meant as a window snapped to multiples of 100 - confirm.
dsp_matches <- function(rex_str, ix) {
    target_txt <- txt_vctr[ix]
    match_pos <- gregexpr(rex_str, target_txt, perl=TRUE)
    print(match_pos)
    first_pos <- match_pos[[1]]
    print(str_sub(target_txt,
                  (first_pos / 100) * 99 + 0,
                  (first_pos / 100) * 100 + 100))
}
# Applies every substitution in the global `gsub_map_lst` (names are the
# patterns, values the replacements), in order, to the global `txt_vctr`.
#
# Args:
#   ...: extra arguments forwarded to gsub() (e.g. ignore.case, perl).
# Returns:
#   The substituted copy of txt_vctr; txt_vctr itself when the map has no
#   names. Progress is printed for every 10th pattern.
myapply_gsub <- function(...) {
    ptn_vctr <- names(gsub_map_lst)
    n_ptns <- length(ptn_vctr)
    if (n_ptns == 0) {
        return(txt_vctr)
    }

    res_vctr <- txt_vctr
    for (ptn_ix in seq_len(n_ptns)) {
        if ((ptn_ix %% 10) == 0) {
            print(sprintf("running gsub for %02d (of %02d): #%s#...", ptn_ix,
                          n_ptns, ptn_vctr[ptn_ix]))
        }
        res_vctr <- gsub(ptn_vctr[ptn_ix], gsub_map_lst[[ptn_ix]], res_vctr, ...)
    }
    res_vctr
}
# Applies the text-munging map in the global `glb_txt_map_df` to `txt_vctr`:
# for each row, in order, gsub() replaces matches of column `rex_str` with
# column `rpl_str`.
#
# Args:
#   txt_vctr: character vector to clean (names are preserved by gsub()).
#   ...: extra arguments forwarded to gsub() (e.g. ignore.case=FALSE).
# Returns:
#   The transformed character vector; `txt_vctr` unchanged when the map has
#   no rows. Progress is printed for every 10th pattern.
myapply_txtmap <- function(txt_vctr, ...) {
    nrows <- nrow(glb_txt_map_df)
    # seq_len(): with an empty map `1:nrows` would iterate over c(1, 0), i.e.
    # run gsub() with an NA pattern (turning all text into NA) and then with a
    # zero-length pattern (an error)
    for (ptn_ix in seq_len(nrows)) {
        if ((ptn_ix %% 10) == 0)
            print(sprintf("running gsub for %02d (of %02d): #%s#...", ptn_ix,
                          nrows, glb_txt_map_df[ptn_ix, "rex_str"]))
        txt_vctr <- gsub(glb_txt_map_df[ptn_ix, "rex_str"],
                         glb_txt_map_df[ptn_ix, "rpl_str"],
                         txt_vctr, ...)
    }
    return(txt_vctr)
    # Debugging snippets for a single observation / pattern:
    #print(txt_vctr <- glb_allobs_df[glb_allobs_df$UniqueID == 11329, "descr.my"])
    #strsplit(txt_vctr, "")[[1]][1]
    #ptn_ix <- 2; glb_txt_map_df[ptn_ix, ]
    #gsub(glb_txt_map_df[ptn_ix, "rex_str"], glb_txt_map_df[ptn_ix, "rpl_str"], txt_vctr)
    #print(match_lst <- gregexpr(glb_txt_map_df[ptn_ix, "rex_str"], txt_vctr))
    #strsplit(glb_txt_map_df[ptn_ix, "rex_str"], "")[[1]]
}
# Prints whether elements bgn..end of the "Headline" text agree between the
# saved snapshot (global sav_txt_lst) and the current text (global
# glb_txt_chr_lst). Returns the all.equal() result, invisibly.
chk.equal <- function(bgn, end) {
    sel_ix <- bgn:end
    saved_vctr <- sav_txt_lst[["Headline"]][sel_ix]
    current_vctr <- glb_txt_chr_lst[["Headline"]][sel_ix]
    print(all.equal(saved_vctr, current_vctr))
}
# Prints elements bgn..end of the "Headline" text, first from the saved
# snapshot (global sav_txt_lst) and then from the current text (global
# glb_txt_chr_lst). Returns the current slice, invisibly.
dsp.equal <- function(bgn, end) {
    sel_ix <- bgn:end
    print(sav_txt_lst[["Headline"]][sel_ix])
    print(glb_txt_chr_lst[["Headline"]][sel_ix])
}
#sav_txt_lst <- glb_txt_chr_lst; all.equal(sav_txt_lst, glb_txt_chr_lst)
#all.equal(sav_txt_lst[["Headline"]][1:4200], glb_txt_chr_lst[["Headline"]][1:4200])
#chk.equal( 1, 100)
#dsp.equal(86, 90)
# Load the text-munging map "<glb_txt_munge_filenames_pfx>map.csv" (columns
# rex_str / rpl_str are used by myapply_txtmap); lines starting with "#" in
# the file are treated as comments. Stops if the file does not exist.
txt_map_filename <- paste0(glb_txt_munge_filenames_pfx, "map.csv")
if (!file.exists(txt_map_filename))
stop(txt_map_filename, " not found!")
glb_txt_map_df <- read.csv(txt_map_filename, comment.char="#", strip.white=TRUE)
# Build glb_txt_chr_lst: one cleaned character vector per text feature, named
# by the observation ids (glb_id_var) and passed through myapply_txtmap()
# case-sensitively. Each foreach iteration returns its last expression.
glb_txt_chr_lst <- list();
print(sprintf("Building glb_txt_chr_lst..."))
glb_txt_chr_lst <- foreach(txt_var = glbFeatsText) %dopar% {
# for (txt_var in glbFeatsText) {
txt_vctr <- glb_allobs_df[, txt_var]
names(txt_vctr) <- glb_allobs_df[, glb_id_var]
# myapply_txtmap shd be created as a tm_map::content_transformer ?
# Debugging snippets for inspecting individual map entries:
#print(glb_txt_map_df)
#txt_var=glbFeatsText[3]; txt_vctr <- glb_txt_chr_lst[[txt_var]]
#print(rex_str <- glb_txt_map_df[3, "rex_str"])
#print(rex_str <- glb_txt_map_df[glb_txt_map_df$rex_str == "\\bWall St\\.", "rex_str"])
#print(rex_str <- glb_txt_map_df[grepl("du Pont", glb_txt_map_df$rex_str), "rex_str"])
#print(rex_str <- glb_txt_map_df[glb_txt_map_df$rpl_str == "versus", "rex_str"])
#print(tmp_vctr <- grep(rex_str, txt_vctr, value=TRUE, ignore.case=FALSE))
#ret_lst <- regexec(rex_str, txt_vctr, ignore.case=FALSE); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])
#gsub(rex_str, glb_txt_map_df[glb_txt_map_df$rex_str == rex_str, "rpl_str"], tmp_vctr, ignore.case=FALSE)
#grep("Hong Hong", txt_vctr, value=TRUE)
txt_vctr <- myapply_txtmap(txt_vctr, ignore.case=FALSE)
}
# foreach returns an unnamed list in the order of glbFeatsText
names(glb_txt_chr_lst) <- glbFeatsText
# Report the remaining stand-alone "OK" / "Ok" / "ok" occurrences in every
# text feature. Each pattern excludes (via look-behind / look-ahead) the
# contexts in which the letters are part of a longer word. For each pattern
# the match frequencies are printed, followed by the context of every
# matching observation.
ok_rex_vctr <- c(
    "(?<!(BO|HO|LO))OK(?!(E\\!|ED|IE|IN|S ))",
    "Ok(?!(a\\.|ay|in|ra|um))",
    "(?<!( b| B| c| C| g| G| j| M| p| P| w| W| r| Z|\\(b|ar|bo|Bo|co|Co|Ew|gk|go|ho|ig|jo|kb|ke|Ke|ki|lo|Lo|mo|mt|no|No|po|ra|ro|sm|Sm|Sp|to|To))ok(?!(ay|bo|e |e\\)|e,|e\\.|eb|ed|el|en|er|es|ey|i |ie|in|it|ka|ke|ki|ly|on|oy|ra|st|u |uc|uy|yl|yo))")
for (txt_var in glbFeatsText) {
    print(sprintf("Remaining OK in %s:", txt_var))
    # chk_pattern_freq / dsp_pattern / dsp_matches read this global
    txt_vctr <- glb_txt_chr_lst[[txt_var]]
    for (rex_str in ok_rex_vctr) {
        print(chk_pattern_freq(rex_str, ignore.case=FALSE))
        match_df <- dsp_pattern(rex_str, ignore.case=FALSE, print.all=FALSE)
        for (row in row.names(match_df)) {
            # Row names of match_df are the names of txt_vctr (observation ids)
            # when it is named, not positions: translate them back to a
            # position, and fall back to the numeric row name when txt_vctr is
            # unnamed (match() against NULL yields NA)
            row_ix <- match(row, names(txt_vctr))
            if (is.na(row_ix))
                row_ix <- as.numeric(row)
            dsp_matches(rex_str, ix=row_ix)
        }
    }
}
# txt_vctr <- glb_txt_chr_lst[[glbFeatsText[1]]]
# print(chk_pattern_freq(rex_str <- "(?<!( b| c| C| p|\\(b|bo|co|lo|Lo|Sp|to|To))ok(?!(ay|e |e\\)|e,|e\\.|ed|el|en|es|ey|ie|in|on|ra))", ignore.case=FALSE))
# print(chk_pattern_freq(rex_str <- "ok(?!(ay|el|on|ra))", ignore.case=FALSE))
# dsp_pattern(rex_str, ignore.case=FALSE, print.all=FALSE)
# dsp_matches(rex_str, ix=8)
# substr(txt_vctr[86], 5613, 5620)
# substr(glb_allobs_df[301, "review"], 550, 650)
#stop(here"); sav_txt_lst <- glb_txt_chr_lst
# Report remaining acronyms in every text feature: sequences of two or more
# "<upper-case letter>." groups, then single "<UPPER>." tokens seen more than
# once (possible names / initials). chk_pattern_freq() reads the global
# txt_vctr, which is why it is reassigned at the top of every loop below.
for (txt_var in glbFeatsText) {
print(sprintf("Remaining Acronyms in %s:", txt_var))
txt_vctr <- glb_txt_chr_lst[[txt_var]]
print(chk_pattern_freq(rex_str <- "([[:upper:]]\\.( *)){2,}", ignore.case=FALSE))
# Check for names
print(subset(chk_pattern_freq(rex_str <- "(([[:upper:]]+)\\.( *)){1}",
ignore.case=FALSE),
.n > 1))
# dsp_pattern(rex_str="(OK\\.( *)){1}", ignore.case=FALSE)
# dsp_matches(rex_str="(OK\\.( *)){1}", ix=557)
#dsp_matches(rex_str="\\bR\\.I\\.P(\\.*)(\\B)", ix=461)
#dsp_matches(rex_str="\\bR\\.I\\.P(\\.*)", ix=461)
#print(str_sub(txt_vctr[676], 10100, 10200))
#print(str_sub(txt_vctr[74], 1, -1))
}
# Report two-word place names (Fort ..., New ..., San ..., St. ...) whose
# second word starts with an upper-case letter, most frequent first.
for (txt_var in glbFeatsText) {
re_str <- "\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
txt_vctr <- glb_txt_chr_lst[[txt_var]]
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl("( |-)[[:upper:]]", pattern))))
print(" consider cleaning if relevant to problem domain; geography name; .n > 1")
#grep("New G", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("St\\. Wins", txt_vctr, value=TRUE, ignore.case=FALSE)
}
#stop(here"); sav_txt_lst <- glb_txt_chr_lst
# Report single-letter compass abbreviations (N / S / E / W / C) followed by
# a word.
# NOTE(review): grepl(".", pattern) treats "." as a regex wildcard, so it
# keeps every non-empty pattern; a literal-dot filter may have been intended.
for (txt_var in glbFeatsText) {
re_str <- "\\b(N|S|E|W|C)( |\\.)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
txt_vctr <- glb_txt_chr_lst[[txt_var]]
print(orderBy(~ -.n +pattern, subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl(".", pattern))))
#grep("N Weaver", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# Report spelled-out compass words (North / South / East / West / Central)
# followed by a word; printed only when at least one match remains (same
# grepl(".") caveat as above).
for (txt_var in glbFeatsText) {
re_str <- "\\b(North|South|East|West|Central)( |\\.)(\\w)+"
print(sprintf("Remaining #%s# terms in %s: ", re_str, txt_var))
txt_vctr <- glb_txt_chr_lst[[txt_var]]
if (nrow(filtered_df <- subset(chk_pattern_freq(re_str, ignore.case=FALSE),
grepl(".", pattern))) > 0)
print(orderBy(~ -.n +pattern, filtered_df))
#grep("Central (African|Bankers|Cast|Italy|Role|Spring)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("East (Africa|Berlin|London|Poland|Rivals|Spring)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("North (American|Korean|West)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("South (Pacific|Street)", txt_vctr, value=TRUE, ignore.case=FALSE)
#grep("St\\. Martins", txt_vctr, value=TRUE, ignore.case=FALSE)
}
# Reports the hyphenated (compound) terms remaining in `txt_vctr`.
#
# The text is lower-cased, punctuation other than intra-word dashes is
# removed, and term frequencies are computed. Terms containing "-" that are
# not matched (case-insensitively) by any `rex_str` in the filter file
# "<glb_txt_munge_filenames_pfx>compound.csv" are printed.
#
# Args:
#   txt_vctr: character vector of documents.
# Returns:
#   FALSE when the text contains no compound terms at all; otherwise the
#   value of myprint_df() for the unfiltered compound terms.
# Raises:
#   Stops when compound terms exist but the filter file cannot be found.
find_cmpnd_wrds <- function(txt_vctr) {
    # Enhancements:
    #   - arg should be txt_corpus instead of txt_vctr
    txt_corpus <- Corpus(VectorSource(txt_vctr))
    txt_corpus <- tm_map(txt_corpus, content_transformer(tolower), lazy = TRUE)
    txt_corpus <- tm_map(txt_corpus, PlainTextDocument, lazy = TRUE)
    txt_corpus <- tm_map(txt_corpus, removePunctuation,
                         preserve_intra_word_dashes = TRUE, lazy = FALSE)
    # Defaulting to Tf since TfIdf with normalize = TRUE throws a warning for empty docs
    terms_mtrx <- as.matrix(TermDocumentMatrix(txt_corpus,
                                               control = list(weighting = weightTf)))
    terms_df <- orderBy(~ -Tf, data.frame(term = dimnames(terms_mtrx)$Terms,
                                          Tf = rowSums(terms_mtrx)))
    cmpnd_df <- subset(terms_df, grepl("-", term))
    if (nrow(cmpnd_df) == 0) {
        print("    No compounded terms found")
        return(FALSE)
    }

    txt_compound_filename <- paste0(glb_txt_munge_filenames_pfx, "compound.csv")
    if (!file.exists(txt_compound_filename))
        stop(txt_compound_filename, " not found!")
    filter_df <- read.csv(txt_compound_filename, comment.char="#", strip.white=TRUE)

    # Flag terms covered by a filter; already-flagged terms are not re-tested.
    # seq_len(): an empty filter file must leave every term unflagged, whereas
    # `1:nrow(filter_df)` would iterate over c(1, 0) and test NA / zero-length
    # patterns
    cmpnd_df$filter <- FALSE
    for (row_ix in seq_len(nrow(filter_df)))
        cmpnd_df[!cmpnd_df$filter, "filter"] <-
            grepl(filter_df[row_ix, "rex_str"],
                  cmpnd_df[!cmpnd_df$filter, "term"], ignore.case=TRUE)
    cmpnd_df <- subset(cmpnd_df, !filter)

    # Bug in tm_map(txt_corpus, removePunctuation, preserve_intra_word_dashes=TRUE) ???
    #   "net-a-porter" gets converted to "net-aporter"
    #grep("net-a-porter", txt_vctr, ignore.case=TRUE, value=TRUE)
    #grep("maser-laser", txt_vctr, ignore.case=TRUE, value=TRUE)
    #txt_corpus[[which(grepl("net-a-porter", txt_vctr, ignore.case=TRUE))]]
    #grep("\\b(across|longer)-(\\w)", cmpnd_df$term, ignore.case=TRUE, value=TRUE)
    #grep("(\\w)-(affected|term)\\b", cmpnd_df$term, ignore.case=TRUE, value=TRUE)
    print(sprintf("nrow(cmpnd_df): %d", nrow(cmpnd_df)))
    myprint_df(cmpnd_df)
}
# This should be run after glb_txt_corpus_lst is created with tolower
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "process.text_reporting_compound_terms"), major.inc=FALSE)
# for (txt_var in glbFeatsText) {
# print(sprintf("Remaining compound terms in %s: ", txt_var))
# find_cmpnd_wrds(txt_vctr = glb_txt_chr_lst[[txt_var]])
# #grep("thirty-five", txt_vctr, ignore.case=TRUE, value=TRUE)
# #rex_str <- glb_txt_map_df[grepl("hirty", glb_txt_map_df$rex_str), "rex_str"]
# }
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "build.corpus"), major.inc=TRUE)
# Builds a per-term summary from a term/document matrix.
#
# Args:
#   terms_TDM: object convertible by as.TermDocumentMatrix() and
#              as.DocumentTermMatrix(); documents must line up with the rows
#              of the global glb_allobs_df.
# Returns:
#   Data frame, one row per term, ordered by decreasing `weight`, with:
#     term, weight (row sum), freq (number of docs with a non-zero entry),
#     pos (position before ordering), cor.y / cor.y.abs (correlation with the
#     numeric response on the "Train" rows), chisq.stat / chisq.pval (term vs
#     response on the "Train" rows; NA when the term is constant there),
#     nzv.freqRatio / nzv.percentUnique / nzv (from nzv() on the "Train"
#     rows), and weight.<cls> (weight restricted to the docs of each value,
#     NA included, of glb_allobs_df[, glb_txt_cor_var]).
get_txt_terms <- function(terms_TDM) {
    terms_mtrx <- as.matrix(as.TermDocumentMatrix(terms_TDM))
    docms_mtrx <- as.matrix(as.DocumentTermMatrix(terms_TDM))

    # Training rows and their response are needed by several statistics below
    is_trn <- glb_allobs_df$.src == "Train"
    trn_rsp <- glb_allobs_df[is_trn, glb_rsp_var]

    terms_df <- data.frame(term = dimnames(terms_mtrx)$Terms,
                           weight = rowSums(terms_mtrx),
                           freq = rowSums(terms_mtrx > 0))
    terms_df$pos <- 1:nrow(terms_df)

    terms_df$cor.y <- cor(docms_mtrx[is_trn, ],
                          as.numeric(trn_rsp),
                          use = "pairwise.complete.obs")
    terms_df$cor.y.abs <- abs(terms_df$cor.y)
#     .rnorm.cor.y.abs <- abs(cor(glb_allobs_df[glb_allobs_df$.src == "Train", ".rnorm"],
#                            as.numeric(glb_allobs_df[glb_allobs_df$.src == "Train", glb_rsp_var]),
#                                 use = "pairwise.complete.obs"))

    # Chi-squared test per term; only defined when the term varies in training
    terms_df$chisq.stat <- NA
    terms_df$chisq.pval <- NA
    for (ix in 1:nrow(terms_df)) {
    #for (ix in 1:743) {
        trn_term_vctr <- docms_mtrx[is_trn, ix]
        if (length(unique(trn_term_vctr)) > 1) {
            chisq <- chisq.test(trn_term_vctr, trn_rsp)
            terms_df[ix, "chisq.stat"] <- chisq$statistic
            terms_df[ix, "chisq.pval"] <- chisq$p.value
        }
    }

    # Near-zero-variance metrics on the training documents
    nzv_df <- nzv(docms_mtrx[is_trn, ], freqCut = glb_nzv_freqCut,
                  uniqueCut = glb_nzv_uniqueCut, saveMetrics = TRUE)
    terms_df$nzv.freqRatio <- nzv_df$freqRatio
#     terms_df$nzv.freqRatio.cut.fctr <- cut(terms_df$nzv.freqRatio,
#                                            breaks = sort(c(min(terms_df$nzv.freqRatio),
#                                                             glb_nzv_freqCut,
#                                                             max(terms_df$nzv.freqRatio))))
    terms_df$nzv.percentUnique <- nzv_df$percentUnique
#     terms_df$nzv.percentUnique.cut.fctr <- cut(terms_df$nzv.percentUnique,
#             breaks = sort(c(min(terms_df$nzv.percentUnique) - .Machine$double.neg.eps,
#                                                             glb_nzv_uniqueCut,
#                                                             max(terms_df$nzv.percentUnique))))
#     terms_df$nzv.quad.fctr <- as.factor(paste0("fRatio:", terms_df$nzv.freqRatio.cut.fctr,
#                                     "\n%Unq:", terms_df$nzv.percentUnique.cut.fctr))
    terms_df$nzv <- nzv_df$nzv

    # Term weight per class: the docs x terms matrix is multiplied by a 0/1
    # document mask (recycled down the columns) and summed per term
    cor_var_vctr <- glb_allobs_df[, glb_txt_cor_var]
    docs_terms_mtrx <- t(terms_mtrx)
    for (cls in unique(cor_var_vctr)) {
        cls_mask <- if (!is.na(cls)) {
            !is.na(cor_var_vctr) & (cor_var_vctr == cls)
        } else {
            is.na(cor_var_vctr)
        }
        terms_df[, paste0("weight.", as.character(cls))] <-
            colSums(docs_terms_mtrx * as.numeric(cls_mask))
    }

    # Check all calls to get_terms_DTM_terms to change returned order assumption
    return(terms_df <- orderBy(~ -weight, terms_df))
}
#plt_full_df <- get_terms_DTM_terms(terms_DTM=glb_full_terms_DTM_lst[[txt_var]])
# Term summary for a corpus: builds its TermDocumentMatrix with the global
# glb_txt_terms_control settings and returns get_txt_terms() of that matrix.
get_corpus_terms <- function(txt_corpus) {
    corpus_TDM <- TermDocumentMatrix(txt_corpus, control = glb_txt_terms_control)
    return(terms_df <- get_txt_terms(terms_TDM = corpus_TDM))
}
# Punctuation clean-up for text: apostrophes and hyphens are deleted outright
# (so "don't" -> "dont", "re-use" -> "reuse"); every remaining run of
# punctuation characters is replaced by a single space.
myreplacePunctuation <- function(x) {
    joined_txt <- gsub("('|-)", "", x)
    spaced_txt <- gsub("[[:punct:]]+", " ", joined_txt)
    return(spaced_txt)
}
#stop(here"); glb_to_sav()
# Build glb_txt_corpus_lst: one tm corpus per text feature, created from the
# cleaned text in glb_txt_chr_lst and then lower-cased, stripped of
# punctuation via myreplacePunctuation, and (when configured for the feature
# in glb_txt_stop_words) stripped of stop words.
glb_txt_corpus_lst <- list()
print(sprintf("Building glb_txt_corpus_lst..."))
glb_txt_corpus_lst <- foreach(txt_var = glbFeatsText, .verbose = FALSE) %dopar% {
#glb_txt_corpus_lst <- foreach(txt_var = glbFeatsText, .verbose = TRUE) %do% {
#for (txt_var in glbFeatsText) {
txt_corpus <- Corpus(VectorSource(glb_txt_chr_lst[[txt_var]]))
txt_corpus <- tm_map(txt_corpus, PlainTextDocument, lazy = TRUE)
txt_corpus <- tm_map(txt_corpus, content_transformer(tolower), lazy = TRUE) #nuppr
# removePunctuation does not replace with whitespace. Use a custom transformer ???
#txt_corpus <- tm_map(txt_corpus, removePunctuation, lazy = TRUE) #npnct<chr_ix>
txt_corpus <- tm_map(txt_corpus, content_transformer(myreplacePunctuation)
, lazy = TRUE) #npnct<chr_ix>
# txt-corpus <- tm_map(txt_corpus, content_transformer(function(x, pattern) gsub(pattern, "", x))
if (!is.null(glb_txt_stop_words[[txt_var]]))
txt_corpus <- tm_map(txt_corpus, removeWords, glb_txt_stop_words[[txt_var]],
lazy = FALSE)#, lazy=TRUE) #nstopwrds
# foreach result is based on .Last.Eval
# (the self-assignment makes the corpus the value of the iteration even when
# the stop-word branch above was skipped)
txt_corpus <- txt_corpus
# glb_txt_corpus_lst[[txt_var]] <- txt_corpus
}
# foreach returns an unnamed list in the order of glbFeatsText
names(glb_txt_corpus_lst) <- glbFeatsText
# tm content transformer that rewrites synonyms to a canonical word.
#
# Args (of the wrapped function):
#   x:   text content of a document.
#   syn: list whose elements each carry `$syns` (the alternatives to look for) and `$word`
#        (the replacement). NULL / empty leaves `x` untouched.
# Each entry is applied in order; alternatives are matched as whole words (\b ... \b).
mycombineSynonyms <- content_transformer(function(x, syn = NULL) {
    txt <- x
    for (entry in syn) {
        syn_rex <- paste0("\\b(", paste(entry$syns, collapse = "|"), ")\\b")
        txt <- gsub(syn_rex, entry$word, txt)
    }
    txt
})
#stop(here"); glb_to_sav(); sav_txt_corpus <- glb_txt_corpus_lst[[txt_var]]; all.equal(sav_txt_corpus, glb_txt_corpus_lst[[txt_var]]); glb_txt_corpus_lst[[txt_var]] <- sav_txt_corpus
# For every text feature: report term weights after stop-word removal, stem the corpus
# (and fold synonyms) IN PLACE inside glb_txt_corpus_lst, report term weights again, and
# append per-observation term-count / weight-sum features (before vs. after stemming) to
# glb_allobs_df. The four lists below keep the term summaries and the dense document-term
# matrices of both stages, keyed by text feature.
glb_post_stop_words_terms_df_lst <- list();
glb_post_stop_words_terms_mtrx_lst <- list();
glb_post_stem_words_terms_df_lst <- list();
glb_post_stem_words_terms_mtrx_lst <- list();
for (txt_var in glbFeatsText) {
print(sprintf(" Top_n post-stop term weights for %s:", txt_var))
# This impacts stemming probably due to lazy parameter
print(myprint_df(full_terms_df <-
get_corpus_terms(txt_corpus=glb_txt_corpus_lst[[txt_var]]),
glbFeatsTextTermsMax[[txt_var]]))
glb_post_stop_words_terms_df_lst[[txt_var]] <- full_terms_df
# Dense documents x terms matrix of the post-stop-word corpus
terms_stop_mtrx <- as.matrix(DocumentTermMatrix(glb_txt_corpus_lst[[txt_var]],
control=glb_txt_terms_control))
rownames(terms_stop_mtrx) <- rownames(glb_allobs_df) # printout is unreadable otherwise
glb_post_stop_words_terms_mtrx_lst[[txt_var]] <- terms_stop_mtrx
# Scratch frame: id + response in the first two columns, derived features after them.
# NOTE(review): the -c(1, 2) column drops further down rely on glb_id_var and glb_rsp_var
# each naming exactly one column - confirm.
tmp_allobs_df <- glb_allobs_df[, c(glb_id_var, glb_rsp_var)]
# Per observation: number of terms with a positive weight, its log(1 + n), and the weight total
tmp_allobs_df$terms.post.stop.n <- rowSums(terms_stop_mtrx > 0)
tmp_allobs_df$terms.post.stop.n.log <- log(1 + tmp_allobs_df$terms.post.stop.n)
tmp_allobs_df$weight.post.stop.sum <- rowSums(terms_stop_mtrx)
print(sprintf(" Top_n stem term weights for %s:", txt_var))
# From here on the stored corpus is the stemmed one (later sections read it from
# glb_txt_corpus_lst).
glb_txt_corpus_lst[[txt_var]] <- tm_map(glb_txt_corpus_lst[[txt_var]], stemDocument,
"english", lazy=FALSE)
if (!is.null(glb_txt_synonyms[[txt_var]])) {
# presumably myrmNullObj() strips NULL entries from the synonym list - verify against its definition
syn_lst <- myrmNullObj(glb_txt_synonyms[[txt_var]])
glb_txt_corpus_lst[[txt_var]] <- tm_map(glb_txt_corpus_lst[[txt_var]],
mycombineSynonyms,
syn_lst, lazy=FALSE)
}
print(myprint_df(full_terms_df <- get_corpus_terms(glb_txt_corpus_lst[[txt_var]]),
glbFeatsTextTermsMax[[txt_var]]))
glb_post_stem_words_terms_df_lst[[txt_var]] <- full_terms_df
# Same matrix and per-observation statistics as above, now for the stemmed corpus
terms_stem_mtrx <- as.matrix(DocumentTermMatrix(glb_txt_corpus_lst[[txt_var]],
control=glb_txt_terms_control))
rownames(terms_stem_mtrx) <- rownames(glb_allobs_df) # printout is unreadable otherwise
glb_post_stem_words_terms_mtrx_lst[[txt_var]] <- terms_stem_mtrx
tmp_allobs_df$terms.post.stem.n <- rowSums(terms_stem_mtrx > 0)
tmp_allobs_df$terms.post.stem.n.log <- log(1 + tmp_allobs_df$terms.post.stem.n)
tmp_allobs_df$weight.post.stem.sum <- rowSums(terms_stem_mtrx)
# Ratio of term counts after vs. before stemming; a zero denominator (NaN from 0/0, Inf
# from n/0) is mapped to 1.0
tmp_allobs_df$terms.n.stem.stop.Ratio <-
1.0 * tmp_allobs_df$terms.post.stem.n / tmp_allobs_df$terms.post.stop.n
tmp_allobs_df[(is.nan(tmp_allobs_df$terms.n.stem.stop.Ratio) |
is.infinite(tmp_allobs_df$terms.n.stem.stop.Ratio)),
"terms.n.stem.stop.Ratio"] <- 1.0
# Any observation with more terms after stemming than before aborts the run
if ((n.errors <- sum(tmp_allobs_df$terms.n.stem.stop.Ratio > 1)) > 0)
stop(n.errors, " obs in tmp_allobs_df have terms.n.stem.stop.Ratio > 1",
" happening due to terms filtered by glb_txt_terms_control$bounds$global[1] but stemmable to other terms")
# Debugging aids for the stop() above:
#print(head(subset(tmp_allobs_df, terms.n.stem.stop.Ratio > 1)))
#glb_allobs_df[(row_ix <- which(glb_allobs_df$UniqueID == 10465)), ]
#terms_stop_mtrx[row_ix, terms_stop_mtrx[row_ix, ] > 0]
#setdiff(names(terms_stem_mtrx[row_ix, terms_stem_mtrx[row_ix, ] > 0]), names(terms_stop_mtrx[row_ix, terms_stop_mtrx[row_ix, ] > 0]))
#mydsp_obs(list(descr.my.contains="updat"))
# Same ratio for the weight totals, with the same NaN / Inf -> 1.0 rule
tmp_allobs_df$weight.sum.stem.stop.Ratio <-
1.0 * tmp_allobs_df$weight.post.stem.sum / tmp_allobs_df$weight.post.stop.sum
tmp_allobs_df[is.nan(tmp_allobs_df$weight.sum.stem.stop.Ratio) |
is.infinite(tmp_allobs_df$weight.sum.stem.stop.Ratio),
"weight.sum.stem.stop.Ratio"] <- 1.0
# Print the correlation of every derived column with the response, using only rows whose
# response is not NA
tmp_trnobs_df <- tmp_allobs_df[!is.na(tmp_allobs_df[, glb_rsp_var]), ]
print(cor(as.matrix(tmp_trnobs_df[, -c(1, 2)]),
as.numeric(tmp_trnobs_df[, glb_rsp_var])))
# Column prefix = upper-cased first character of the text feature name
txt_var_pfx <- toupper(substr(txt_var, 1, 1))
# Drop the id / response columns, prefix the rest with "<pfx>." and bind them to glb_allobs_df
tmp_allobs_df <- tmp_allobs_df[, -c(1, 2)]
names(tmp_allobs_df) <- paste(paste0(txt_var_pfx, "."), names(tmp_allobs_df), sep = "")
glb_allobs_df <- cbind(glb_allobs_df, tmp_allobs_df)
# The raw counts are added to glbFeatsExclude; their .log versions are not
glbFeatsExclude <- c(glbFeatsExclude,
paste(paste0(txt_var_pfx, ".terms.post."), c("stop.n", "stem.n"), sep = ""))
}
# Draw one word cloud per text feature from the post-stem document-term matrix.
require(wordcloud)
for (txt_var in glbFeatsText) {
    print(sprintf(" Wordcloud post-stem term weights for %s:", txt_var))

    m <- glb_post_stem_words_terms_mtrx_lst[[txt_var]]
    # column totals = weight of each term summed over all documents, heaviest first
    v <- sort(colSums(m), decreasing = TRUE)
    myNames <- names(v)
    d <- data.frame(word = myNames, freq = v)

    # terms below the global lower bound of the term control are not drawn
    with(d, wordcloud(word, freq, min.freq = glb_txt_terms_control$bounds$global[1]))
}
# Per text feature: scatter plot of every post-stem term with
#   x     = correlation of the term with the response (cor.y)
#   y     = chi-squared statistic of the term vs. the response (chisq.stat)
#   color = near-zero-variance quadrant (freqRatio bin x percentUnique bin)
#   size  = term weight
# Dashed vertical lines mark +/- the absolute correlation of the .rnorm column with the
# response on training rows; terms inside that band correlate no better than .rnorm does.

# The benchmark does not depend on txt_var, so it is computed once instead of per iteration.
.rnorm.cor.y.abs <- abs(cor(glb_allobs_df[glb_allobs_df$.src == "Train", ".rnorm"],
                as.numeric(glb_allobs_df[glb_allobs_df$.src == "Train", glb_rsp_var]),
                            use = "pairwise.complete.obs"))
for (txt_var in glbFeatsText) {
    # terms without a computable correlation cannot be placed on the x axis
    plt_df <- subset(glb_post_stem_words_terms_df_lst[[txt_var]], !is.na(cor.y))

    # cut() builds (lo, hi] intervals, so without include.lowest the term holding the
    # minimum freqRatio falls outside every interval and ends up in an "NA" bin.
    plt_df$nzv.freqRatio.cut.fctr <- cut(plt_df$nzv.freqRatio,
        breaks = sort(c(min(plt_df$nzv.freqRatio),
                        glb_nzv_freqCut,
                        max(plt_df$nzv.freqRatio))),
        include.lowest = TRUE)
    # The epsilon shift is kept; include.lowest additionally covers the case where the
    # minimum is large enough for the epsilon to be absorbed by floating-point rounding.
    plt_df$nzv.percentUnique.cut.fctr <- cut(plt_df$nzv.percentUnique,
        breaks = sort(c(min(plt_df$nzv.percentUnique) - .Machine$double.neg.eps,
                        glb_nzv_uniqueCut,
                        max(plt_df$nzv.percentUnique))),
        include.lowest = TRUE)
    plt_df$nzv.quad.fctr <- as.factor(paste0("fRatio:", plt_df$nzv.freqRatio.cut.fctr,
                                        "\n%Unq:", plt_df$nzv.percentUnique.cut.fctr))

    # Label a term when it is NOT near-zero-variance, OR when it is both chi-squared
    # significant (p < 0.05) and more correlated with the response than .rnorm.
    # (`&` binds tighter than `|`; the extra parentheses only make that explicit.)
    labelCnd <- !plt_df$nzv |
        ((!is.na(plt_df$chisq.pval) & (plt_df$chisq.pval < 0.05)) &
         (!is.na(plt_df$cor.y.abs) & (plt_df$cor.y.abs > .rnorm.cor.y.abs)))
    # character NA + as.character() keep the labels textual even if `term` is a factor
    plt_df$label <- NA_character_
    plt_df[labelCnd, "label"] <- as.character(plt_df[labelCnd, "term"])

    print(ggplot(plt_df, aes(x = cor.y, y = chisq.stat)) +
              geom_point(aes(color = nzv.quad.fctr, size = weight)) +
              geom_text(aes(label = label), color = "gray50") +
              # geom_vline(xintercept = 0) +
              geom_vline(xintercept = c(-1, +1) * .rnorm.cor.y.abs, color = "gray",
                         linetype = "dashed") +
              ggtitle(txt_var))
}
# Chunk bookkeeping, then build - per text feature - the full document-term matrix of the
# (stemmed) corpus and its sparse-terms-removed counterpart.
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
                                         paste0("extract.features_", "extract.DTM"),
                                         major.inc = TRUE)
#stop(here")
glb_full_DTM_lst <- list()
glb_sprs_DTM_lst <- list()
for (txt_var in glbFeatsText) {
    print(sprintf("Extracting term weights for %s...", txt_var))
    txt_corpus <- glb_txt_corpus_lst[[txt_var]]

    # every term allowed by glb_txt_terms_control
    glb_full_DTM_lst[[txt_var]] <- full_DTM <-
        DocumentTermMatrix(txt_corpus, control = glb_txt_terms_control)

    # same matrix after removeSparseTerms() with the per-feature sparsity threshold
    glb_sprs_DTM_lst[[txt_var]] <- sprs_DTM <-
        removeSparseTerms(full_DTM, glb_sprs_thresholds[txt_var])
}
# Chunk bookkeeping, then - per text feature - print the full and sparse document-term
# matrices and plot how the terms kept by removeSparseTerms() compare with the dropped ones:
#   1. freq vs. weight scatter, colored by membership in the sparse matrix; dropped terms
#      that weigh at least as much as the lightest kept term are labelled
#   2. empirical CDF of weight and freq, with the sparsity threshold as a dotted line
#   3. per-class weights of the kept terms
#   4. per-class weights of the dropped terms (top glbFeatsTextTermsMax rows only)
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
            paste0("extract.features_", "report.DTM"), major.inc=TRUE)
require(reshape2)
for (txt_var in glbFeatsText) {
    print(sprintf("Reporting term weights for %s...", txt_var))
    full_DTM <- glb_full_DTM_lst[[txt_var]]
    sprs_DTM <- glb_sprs_DTM_lst[[txt_var]]

    print(" Full TermMatrix:"); print(full_DTM)
    full_terms_df <- get_txt_terms(full_DTM)
    # full_terms_df <- full_terms_df[, c(2, 1, 3, 4)]
    # col_names <- names(full_terms_df)
    # col_names[2:length(col_names)] <-
    #     paste(col_names[2:length(col_names)], ".full", sep="")
    # names(full_terms_df) <- col_names

    print(" Sparse TermMatrix:"); print(sprs_DTM)
    sprs_terms_df <- get_txt_terms(sprs_DTM)
    # sprs_terms_df <- sprs_terms_df[, c(2, 1, 3, 4)]
    # col_names <- names(sprs_terms_df)
    # col_names[2:length(col_names)] <-
    #     paste(col_names[2:length(col_names)], ".sprs", sep="")
    # names(sprs_terms_df) <- col_names

    #intersect(names(full_terms_df), names(sprs_terms_df))
    # Left-join the sparse summary onto the full one. Columns that are not part of the key
    # get a ".full" / ".sprs" suffix, so pos.sprs is NA exactly for the terms that
    # removeSparseTerms() dropped.
    terms_df <- merge(full_terms_df, sprs_terms_df, by = c("term", "weight", "freq",
                    grep("weight\\.", names(full_terms_df), value = TRUE)),
                      all.x = TRUE, suffixes = c(".full", ".sprs"))
    terms_df$in.sprs <- !is.na(terms_df$pos.sprs)

    plt_terms_df <- subset(terms_df,
                    weight >= min(terms_df$weight[!is.na(terms_df$pos.sprs)], na.rm=TRUE))
    plt_terms_df$label <- ""
    plt_terms_df[is.na(plt_terms_df$pos.sprs), "label"] <-
        plt_terms_df[is.na(plt_terms_df$pos.sprs), "term"]
    # glb_important_terms[[txt_var]] <- union(glb_important_terms[[txt_var]],
    #     plt_terms_df[is.na(plt_terms_df$TfIdf.sprs), "term"])
    print(myplot_scatter(plt_terms_df, "freq", "weight",
                         colorcol_name="in.sprs") +
              geom_text(aes(label=label), color="Black", size=3.5))

    melt_terms_df <- orderBy(~ -value,
                    melt(terms_df, id.vars="term", measure.vars = c("weight", "freq")))
    print(ggplot(melt_terms_df, aes(value, color=variable)) + stat_ecdf() +
              geom_hline(yintercept=glb_sprs_thresholds[txt_var],
                         linetype = "dotted"))

    # Per-class weight columns. The dot is escaped so that it matches a literal "." -
    # the same pattern the merge key above uses - instead of "any character".
    weight_cls_cols <- grep("weight\\.", names(terms_df), value = TRUE)

    melt_terms_df <- orderBy(~ -value,
                    melt(subset(terms_df, in.sprs), id.vars="term",
                         measure.vars = weight_cls_cols))
    print(myplot_hbar(melt_terms_df, "term", "value", colorcol_name="variable"))

    melt_terms_df <- orderBy(~ -value,
                    melt(subset(terms_df, !in.sprs), id.vars="term",
                         measure.vars = weight_cls_cols))
    print(myplot_hbar(head(melt_terms_df, glbFeatsTextTermsMax[[txt_var]]), "term", "value",
                      colorcol_name="variable"))
}
# sav_full_DTM_lst <- glb_full_DTM_lst
# print(identical(sav_glb_txt_corpus_lst, glb_txt_corpus_lst))
# print(all.equal(length(sav_glb_txt_corpus_lst), length(glb_txt_corpus_lst)))
# print(all.equal(names(sav_glb_txt_corpus_lst), names(glb_txt_corpus_lst)))
# print(all.equal(sav_glb_txt_corpus_lst[["Headline"]], glb_txt_corpus_lst[["Headline"]]))
# print(identical(sav_full_DTM_lst, glb_full_DTM_lst))
# Drop the scratch matrix only when it actually exists in the current environment; a bare
# rm() warns ("object 'full_terms_mtrx' not found") when nothing created it.
if (exists("full_terms_mtrx", inherits = FALSE)) rm(full_terms_mtrx)
# Create txt features
# Every derived column is prefixed with the upper-cased first character of its text
# feature, so two features sharing a first letter would produce clashing column names.
pfxs <- vapply(glbFeatsText, function(txt) toupper(substr(txt, 1, 1)), character(1))
if ((length(glbFeatsText) > 1) && (anyDuplicated(pfxs) > 0))
    # collapse with a separator; passing the vector straight to stop() glues the prefixes
    # together ("DD" instead of "D, D")
    stop("Prefixes for corpus freq terms not unique: ", paste(pfxs, collapse = ", "))
# Chunk bookkeeping, then - per text feature - pick the terms to keep (according to
# glbFeatsTextFilter), add the terms tm::findAssocs() associates with them at the
# per-feature correlation limit, and bind the matching "<pfx>.T.<term>" columns of the
# full document-term matrix to glb_allobs_df.
#
# glbFeatsTextFilter:
#   "sparse"            terms that survived removeSparseTerms()
#   "top.val"           the glbFeatsTextTermsMax[[txt_var]] heaviest terms
#   "top.cor"           the ... terms most correlated (absolute value) with the response
#   "top.chisq"         the ... terms with the largest chi-squared statistic among p < 0.05
#   "union.top.val.cor" union of "top.val" and "top.cor"
# Any other value stops with an error.
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
            paste0("extract.features_", "bind.DTM"),
                                         major.inc=TRUE)
#stop(here"); glb_to_sav(); all.equal(sav_allobs_df, glb_allobs_df); glb_allobs_df <- sav_allobs_df
require(tidyr)
for (txt_var in glbFeatsText) {
    print(sprintf("Binding DTM for %s...", txt_var))
    txt_var_pfx <- toupper(substr(txt_var, 1, 1))
    txt_full_X_df <- as.data.frame(as.matrix(glb_full_DTM_lst[[txt_var]]))
    terms_full_df <- get_txt_terms(glb_full_DTM_lst[[txt_var]])
    # make.names adds a period to R keywords e.g. "in", "function"
    colnames(txt_full_X_df) <- paste(txt_var_pfx, ".T.",
                                    make.names(colnames(txt_full_X_df)), sep="")
    rownames(txt_full_X_df) <- rownames(glb_allobs_df) # warning otherwise

    #     plt_full_df <- terms_full_df
    #     names(plt_full_df)[grepl("weight$", names(plt_full_df))] <- "weight.all"
    # #     gather(plt_full_df[1:5, ], domain, TfIdf, -matches("!(TfIdf)"))
    # #     gather(plt_full_df[1:5, grepl("TfIdf", names(plt_full_df))], domain, TfIdf)
    # #     gather(plt_full_df[1:5, ], domain, TfIdf,
    # #            -names(plt_full_df)[!grepl("TfIdf", names(plt_full_df))])
    #     plt_full_df <- gather(plt_full_df, domain, weight,
    #                           -c(term, freq, pos, cor.y, cor.y.abs))
    #     plt_full_df$label <- NA
    #     top_val_terms <- orderBy(~-weight, terms_full_df)$term[1:glbFeatsTextTermsMax[[txt_var]]]
    #     plt_full_df[plt_full_df$term %in% top_val_terms, "label"] <-
    #         plt_full_df[plt_full_df$term %in% top_val_terms, "term"]
    #     top_cor_terms <- orderBy(~-cor.y.abs,
    #                              terms_full_df)$term[1:glbFeatsTextTermsMax[[txt_var]]]
    #     plt_full_df[plt_full_df$term %in% top_cor_terms, "label"] <-
    #         plt_full_df[plt_full_df$term %in% top_cor_terms, "term"]
    #     #plt_full_df$type <- "none"
    #     plt_full_df[plt_full_df$term %in% top_val_terms, "type"] <- "top.weight"
    #     plt_full_df[plt_full_df$term %in% top_cor_terms, "type"] <- "top.cor"
    #     plt_full_df[plt_full_df$term %in% intersect(top_val_terms, top_cor_terms), "type"] <-
    #         "top.both"
    #     cor.y.rnorm <- cor(glb_allobs_df$.rnorm, as.numeric(glb_allobs_df[, glb_rsp_var]),
    #                         use = "pairwise.complete.obs")
    #     print(ggplot(plt_full_df, aes(x=weight, y=cor.y)) + facet_wrap(~ domain) +
    #               geom_point(aes(size=freq), color="grey") +
    #               geom_jitter() +
    #               geom_text(aes(label=label, color=type), size=3.5) +
    #               #geom_hline(yintercept=cor.y.rnorm, color="red") +
    #               geom_hline(yintercept=c(cor.y.rnorm, -cor.y.rnorm), color="red"))

    #stop(here"); glb_to_sav()
    # The "top.*" filters take the first n terms with head(): unlike `$term[1:n]`, head()
    # does not pad the result with NA when fewer than n terms are available (likely for
    # "top.chisq", where only terms with p < 0.05 qualify), so no NA reaches findAssocs().
    terms_max <- glbFeatsTextTermsMax[[txt_var]]
    if (glbFeatsTextFilter == "sparse") {
        txt_X_df <- as.data.frame(as.matrix(glb_sprs_DTM_lst[[txt_var]]))
        # NOTE(review): make.names() alters terms that are not syntactic R names, while
        # the lookups below compare against terms_full_df$term and the DTM's own terms;
        # confirm whether those hold raw or make.names()-ed terms.
        select_terms <- make.names(colnames(txt_X_df))
        #         colnames(txt_X_df) <- paste(txt_var_pfx, ".T.",
        #                                     make.names(colnames(txt_X_df)), sep="")
        #         rownames(txt_X_df) <- rownames(glb_allobs_df) # warning otherwise
    } else if (glbFeatsTextFilter == "top.val") {
        select_terms <- head(orderBy(~-weight, terms_full_df)$term, terms_max)
    } else if (glbFeatsTextFilter == "top.cor") {
        select_terms <- head(orderBy(~-cor.y.abs, terms_full_df)$term, terms_max)
    } else if (glbFeatsTextFilter == "top.chisq") {
        select_terms <- head(orderBy(~-chisq.stat,
                                     subset(terms_full_df, chisq.pval < 0.05))$term,
                             terms_max)
    } else if (glbFeatsTextFilter == "union.top.val.cor") {
        select_terms <- union(
            head(orderBy(~-weight   , terms_full_df)$term, terms_max),
            head(orderBy(~-cor.y.abs, terms_full_df)$term, terms_max))
    } else stop(
        "glbFeatsTextFilter should be one of c('sparse', 'top.val', 'top.cor', 'union.top.val.cor', 'top.chisq') vs. '",
        glbFeatsTextFilter, "'")

    # Terms associated with any selected term at or above the per-feature correlation limit
    assoc_terms_lst <- findAssocs(glb_full_DTM_lst[[txt_var]], select_terms,
                                  glbFeatsTextAssocCor[[txt_var]])
    assoc_terms <- c(NULL)
    for (term in names(assoc_terms_lst))
        if (length(assoc_terms_lst[[term]]) > 0)
            assoc_terms <- union(assoc_terms, names(assoc_terms_lst[[term]]))

    #stop(here"); glb_to_sav()
    # `pos` is used as the column position of the term in the full matrix; drop = FALSE
    # keeps a data frame even when a single term is selected
    txt_X_df <- txt_full_X_df[,
        subset(terms_full_df, term %in% c(select_terms, assoc_terms))$pos,
                                FALSE]
    glb_allobs_df <- cbind(glb_allobs_df, txt_X_df) # TfIdf is normalized
    #glb_allobs_df <- cbind(glb_allobs_df, log_X_df) # if using non-normalized metrics
}
#identical(chk_entity_df, glb_allobs_df)
#chk_entity_df <- glb_allobs_df
# Chunk bookkeeping, then derive per-observation text statistics for every text feature
# (word / character / digit / punctuation counts, stop-word ratio, keyword counts) and bind
# them to glb_allobs_df. All generated columns are prefixed with the upper-cased first
# character of the text feature name. Finally the raw text features themselves are added to
# glbFeatsExclude.
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df,
paste0("extract.features_", "bind.DXM"),
major.inc=TRUE)
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# One regular expression per punctuation feature; its position in this vector becomes the
# two-digit index in the "<pfx>.chrs.pnct<NN>.n.log" column name. Opening/closing
# parentheses and "<" / ">" are each counted together by a single alternation pattern.
glb_punct_vctr <- c("!", "\"", "#", "\\$", "%", "&", "'",
"\\(|\\)",# "\\(", "\\)",
"\\*", "\\+", ",", "-", "\\.", "/", ":", ";",
"<|>", # "<",
"=",
# ">",
"\\?", "@", "\\[", "\\\\", "\\]", "\\^", "_", "`",
"\\{", "\\|", "\\}", "~")
# Seed frame (id + .rnorm) that every foreach iteration extends; both seed columns are
# removed again at the end of the iteration so that only the new features are returned.
txt_X_df <- glb_allobs_df[, c(glb_id_var, ".rnorm"), FALSE]
# The value of each iteration is its last expression (the final txt_X_df assignment);
# .combine=cbind places the per-feature results side by side.
txt_X_df <- foreach(txt_var=glbFeatsText, .combine=cbind) %dopar% {
#for (txt_var in glbFeatsText) {
print(sprintf("Binding DXM for %s...", txt_var))
txt_var_pfx <- toupper(substr(txt_var, 1, 1))
txt_full_DTM_mtrx <- as.matrix(glb_full_DTM_lst[[txt_var]])
rownames(txt_full_DTM_mtrx) <- rownames(glb_allobs_df) # printout is unreadable otherwise
#print(txt_full_DTM_mtrx[txt_full_DTM_mtrx[, "ebola"] != 0, "ebola"])
# Create <txt_var>.T.<term> for glb_important_terms
# NOTE(review): indexing by `term` fails if an important term is not a column of the full
# document-term matrix - confirm the configured terms exist after stemming.
for (term in glb_important_terms[[txt_var]])
txt_X_df[, paste0(txt_var_pfx, ".T.", make.names(term))] <-
txt_full_DTM_mtrx[, term]
# Create <txt_var>.wrds.n.log & .wrds.unq.n.log
# (presumably mycount_pattern_occ() returns the number of matches of the pattern in each
# element of its second argument - verify against its definition)
txt_X_df[, paste0(txt_var_pfx, ".wrds.n.log")] <-
log(1 + mycount_pattern_occ("\\w+", glb_txt_chr_lst[[txt_var]]))
# number of matrix terms with a non-zero weight in the observation
txt_X_df[, paste0(txt_var_pfx, ".wrds.unq.n.log")] <-
log(1 + rowSums(txt_full_DTM_mtrx != 0))
# total term weight of the observation
txt_X_df[, paste0(txt_var_pfx, ".weight.sum")] <-
rowSums(txt_full_DTM_mtrx)
# weight per word; exp(log(1 + n)) - 1 recovers the word count n
txt_X_df[, paste0(txt_var_pfx, ".ratio.weight.sum.wrds.n")] <-
txt_X_df[, paste0(txt_var_pfx, ".weight.sum")] /
(exp(txt_X_df[, paste0(txt_var_pfx, ".wrds.n.log")]) - 1)
# 0 / 0 (no weight and no words) yields NaN, which is mapped to 0
txt_X_df[is.nan(txt_X_df[, paste0(txt_var_pfx, ".ratio.weight.sum.wrds.n")]),
paste0(txt_var_pfx, ".ratio.weight.sum.wrds.n")] <- 0
# Create <txt_var>.chrs.n.log
# Unlike the word counts above, the character-level counts are taken from the text column
# in glb_allobs_df rather than from glb_txt_chr_lst.
txt_X_df[, paste0(txt_var_pfx, ".chrs.n.log")] <-
log(1 + mycount_pattern_occ(".", glb_allobs_df[, txt_var]))
txt_X_df[, paste0(txt_var_pfx, ".chrs.uppr.n.log")] <-
log(1 + mycount_pattern_occ("[[:upper:]]", glb_allobs_df[, txt_var]))
txt_X_df[, paste0(txt_var_pfx, ".dgts.n.log")] <-
log(1 + mycount_pattern_occ("[[:digit:]]", glb_allobs_df[, txt_var]))
# Create <txt_var>.npnct?.log
# would this be faster if it's iterated over each row instead of
# each created column ???
for (punct_ix in 1:length(glb_punct_vctr)) {
# Manual test harness for a single pattern:
# smp0 <- " "
# smp1 <- "! \" # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~"
# smp2 <- paste(smp1, smp1, sep=" ")
# print(sprintf("Testing %s pattern:", glb_punct_vctr[punct_ix]))
# results <- mycount_pattern_occ(glb_punct_vctr[punct_ix], c(smp0, smp1, smp2))
# names(results) <- NULL; print(results)
txt_X_df[,
paste0(txt_var_pfx, ".chrs.pnct", sprintf("%02d", punct_ix), ".n.log")] <-
log(1 + mycount_pattern_occ(glb_punct_vctr[punct_ix],
glb_allobs_df[, txt_var]))
}
# print(head(glb_allobs_df[glb_allobs_df[, "A.npnct23.log"] > 0,
# c("UniqueID", "Popular", "Abstract", "A.npnct23.log")]))
# Create <txt_var>.wrds.stop.n.log & <txt_var>ratio.wrds.stop.n.wrds.n
if (!is.null(glb_txt_stop_words[[txt_var]])) {
# one whole-word alternation over all configured stop words
stop_words_rex_str <- paste0("\\b(",
paste0(glb_txt_stop_words[[txt_var]], collapse="|"),
")\\b")
txt_X_df[, paste0(txt_var_pfx, ".wrds.stop.n", ".log")] <-
log(1 + mycount_pattern_occ(stop_words_rex_str, glb_txt_chr_lst[[txt_var]]))
# exp(log(1 + stop.n) - log(1 + wrds.n)) = (1 + stop.n) / (1 + wrds.n)
txt_X_df[, paste0(txt_var_pfx, ".ratio.wrds.stop.n.wrds.n")] <-
exp(txt_X_df[, paste0(txt_var_pfx, ".wrds.stop.n", ".log")] -
txt_X_df[, paste0(txt_var_pfx, ".wrds.n", ".log")])
}
# Create <txt_var>.P.http
txt_X_df[, paste(txt_var_pfx, ".P.http", sep="")] <-
as.integer(0 + mycount_pattern_occ("http", glb_allobs_df[, txt_var]))
# Create <txt_var>.P.mini & air
# "mini" not followed by "m", and "air" not preceded by f, h or p (Perl look-arounds)
txt_X_df[, paste(txt_var_pfx, ".P.mini", sep="")] <-
as.integer(0 + mycount_pattern_occ("mini(?!m)", glb_allobs_df[, txt_var],
perl=TRUE))
txt_X_df[, paste(txt_var_pfx, ".P.air", sep="")] <-
as.integer(0 + mycount_pattern_occ("(?<![fhp])air", glb_allobs_df[, txt_var],
perl=TRUE))
# Color keyword counts
txt_X_df[, paste(txt_var_pfx, ".P.black", sep="")] <-
as.integer(0 + mycount_pattern_occ("black", glb_allobs_df[, txt_var],
perl=TRUE))
txt_X_df[, paste(txt_var_pfx, ".P.white", sep="")] <-
as.integer(0 + mycount_pattern_occ("white", glb_allobs_df[, txt_var],
perl=TRUE))
txt_X_df[, paste(txt_var_pfx, ".P.gold", sep="")] <-
as.integer(0 + mycount_pattern_occ("gold", glb_allobs_df[, txt_var],
perl=TRUE))
txt_X_df[, paste(txt_var_pfx, ".P.spacegray", sep="")] <-
as.integer(0 + mycount_pattern_occ("spacegray", glb_allobs_df[, txt_var],
perl=TRUE))
# Remove the two seed columns; the result of this last assignment is what foreach collects.
# NOTE(review): grep(..., fixed=TRUE) drops EVERY column whose name contains glb_id_var as
# a substring, not just the id column itself - confirm no feature name can contain it.
txt_X_df <- subset(txt_X_df, select=-.rnorm)
txt_X_df <- txt_X_df[, -grep(glb_id_var, names(txt_X_df), fixed=TRUE), FALSE]
#glb_allobs_df <- cbind(glb_allobs_df, txt_X_df)
}
glb_allobs_df <- cbind(glb_allobs_df, txt_X_df)
#myplot_box(glb_allobs_df, "A.sum.TfIdf", glb_rsp_var)
# if (sum(is.na(glb_allobs_df$D.P.http)) > 0)
# stop("Why is this happening ?")
# Generate summaries
# print(summary(glb_allobs_df))
# print(sapply(names(glb_allobs_df), function(col) sum(is.na(glb_allobs_df[, col]))))
# print(summary(glb_trnobs_df))
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(summary(glb_newobs_df))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# The raw text columns are added to glbFeatsExclude
glbFeatsExclude <- union(glbFeatsExclude,
glbFeatsText)
# NOTE(review): log_X_df is only referenced in commented-out code in this section; if
# nothing earlier creates it, rm() emits an "object not found" warning - confirm.
rm(log_X_df, txt_X_df)
}
## Loading required package: stringr
## label step_major step_minor label_minor
## 2 extract.features_factorize.str.vars 2 0 0
## 3 extract.features_process.text 3 0 0
## bgn end elapsed
## 2 26.927 27.004 0.077
## 3 27.004 NA NA
## [1] "Building glb_txt_chr_lst..."
## [1] "running gsub for 10 (of 179): #\\bCentral African Republic\\b#..."
## [1] "running gsub for 20 (of 179): #\\bAlejandro G\\. Iñárritu#..."
## [1] "running gsub for 30 (of 179): #\\bC\\.A\\.A\\.#..."
## [1] "running gsub for 40 (of 179): #\\bCV\\.#..."
## [1] "running gsub for 50 (of 179): #\\bE\\.P\\.A\\.#..."
## [1] "running gsub for 60 (of 179): #\\bG\\.I\\. Joe#..."
## [1] "running gsub for 70 (of 179): #\\bISIS\\.#..."
## [1] "running gsub for 80 (of 179): #\\bJ\\.K\\. Simmons#..."
## [1] "running gsub for 90 (of 179): #\\bM\\. Henri Pol#..."
## [1] "running gsub for 100 (of 179): #\\bN\\.Y\\.S\\.E\\.#..."
## [1] "running gsub for 110 (of 179): #\\bR\\.B\\.S\\.#..."
## [1] "running gsub for 120 (of 179): #\\bSteven A\\. Cohen#..."
## [1] "running gsub for 130 (of 179): #\\bV\\.A\\.#..."
## [1] "running gsub for 140 (of 179): #\\bWall Street#..."
## [1] "running gsub for 150 (of 179): #\\bSaint( |-)((Laurent|Lucia)\\b)+#..."
## [1] "running gsub for 160 (of 179): #\\bSouth( |\\\\.)(America|American|Africa|African|Carolina|Dakota|Korea|Korean|Sudan)\\b#..."
## [1] "running gsub for 170 (of 179): #(\\w)-a-year#..."
## [1] "Remaining OK in descr.my:"
## pattern .n
## 1 OK 5
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
##
## [1] NA
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
##
## [1] NA
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
##
## [1] NA
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
##
## [1] NA
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
##
## [1] NA
## [1] pattern .n
## <0 rows> (or 0-length row.names)
## [1] pattern .n
## <0 rows> (or 0-length row.names)
## [1] "Remaining Acronyms in descr.my:"
## [1] pattern .n
## <0 rows> (or 0-length row.names)
## pattern .n
## 1 GB. 4
## 2 CONDITION. 3
## 3 ONLY. 3
## 4 BOX. 2
## 5 ESN. 2
## 6 GOOD. 2
## 7 IPADS. 2
## 8 TEARS. 2
## 9 WIFIONLY. 2
## [1] "Remaining #\\b(Fort|Ft\\.|Hong|Las|Los|New|Puerto|Saint|San|St\\.)( |-)(\\w)+# terms in descr.my: "
## pattern .n
## 1 New Open 3
## 3 New Digitizer 1
## 4 New Opened 1
## [1] " consider cleaning if relevant to problem domain; geography name; .n > 1"
## [1] "Remaining #\\b(N|S|E|W|C)( |\\.)(\\w)+# terms in descr.my: "
## pattern .n
## 1 C Stock 3
## [1] "Remaining #\\b(North|South|East|West|Central)( |\\.)(\\w)+# terms in descr.my: "
## label step_major
## 3 extract.features_process.text 3
## 4 extract.features_process.text_reporting_compound_terms 3
## step_minor label_minor bgn end elapsed
## 3 0 0 27.004 28.011 1.007
## 4 1 1 28.011 NA NA
## label step_major
## 4 extract.features_process.text_reporting_compound_terms 3
## 5 extract.features_build.corpus 4
## step_minor label_minor bgn end elapsed
## 4 1 1 28.011 28.016 0.005
## 5 0 0 28.017 NA NA
## [1] "Building glb_txt_corpus_lst..."
## [1] " Top_n post-stop term weights for descr.my:"
## Warning in weighting(x): empty document(s): character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) charact
## Warning in cor(docms_mtrx[glb_allobs_df$.src == "Train", ],
## as.numeric(glb_allobs_df[glb_allobs_df$.src == : the standard deviation is
## zero
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## [1] "Rows: 318; Cols: 14"
## term weight freq pos cor.y cor.y.abs
## condition condition 82.63381 328 62 -0.0193107438 0.0193107438
## likenew likenew 67.91742 46 159 -0.0249509081 0.0249509081
## in in 58.77724 291 132 -0.0750731463 0.0750731463
## is is 51.07423 217 148 0.0076303864 0.0076303864
## used used 50.85205 132 298 0.0001369895 0.0001369895
## and and 49.98953 201 19 0.0264391360 0.0264391360
## chisq.stat chisq.pval nzv.freqRatio nzv.percentUnique nzv
## condition 34.99889 0.028243004 22.37143 2.167488 TRUE
## likenew 11.19258 0.738827061 196.60000 1.576355 TRUE
## in 29.13620 0.141013206 22.55556 2.266010 TRUE
## is 50.21794 0.006114667 43.75000 2.857143 TRUE
## used 34.87666 0.014452154 41.86364 1.970443 TRUE
## and 31.35854 0.089008983 54.25000 2.266010 TRUE
## weight.N weight.Y weight.NA
## condition 45.66614 10.882849 26.08482
## likenew 36.90375 5.416672 25.59700
## in 35.19219 5.786290 17.79877
## is 25.75795 7.373010 17.94327
## used 27.35227 7.448567 16.05121
## and 27.56305 8.827670 13.59881
## term weight freq pos cor.y cor.y.abs chisq.stat
## likenew likenew 67.917419 46 159 -0.024950908 0.024950908 11.192580
## by by 9.868168 12 41 -0.023343140 0.023343140 6.132453
## just just 7.065618 13 153 0.009503472 0.009503472 9.802940
## never never 5.732774 8 183 0.027236548 0.027236548 4.765167
## mark mark 5.004191 13 170 -0.001533752 0.001533752 4.324709
## usb usb 3.270302 5 296 0.037815608 0.037815608 5.201060
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N
## likenew 0.7388271 196.600 1.5763547 TRUE 36.903749
## by 0.4085175 251.250 0.6896552 TRUE 8.109322
## just 0.2000184 200.800 0.7881773 TRUE 4.655125
## never 0.3122523 505.000 0.4926108 TRUE 1.952691
## mark 0.2284689 125.625 0.3940887 TRUE 2.985877
## usb 0.1576529 505.000 0.3940887 TRUE 1.847279
## weight.Y weight.NA
## likenew 5.4166717 25.5969980
## by 0.6903882 1.0684579
## just 1.5868998 0.8235932
## never 1.2481407 2.5319426
## mark 0.7815597 1.2367539
## usb 1.4230233 0.0000000
## term weight freq pos cor.y cor.y.abs chisq.stat
## photo photo 0.6992563 1 210 -0.01637606 0.01637606 4.077526e-28
## cleared cleared 0.6555528 1 59 -0.01637606 0.01637606 4.077526e-28
## devices devices 0.6555528 1 89 NA NA NA
## defect defect 0.6169908 1 81 NA NA NA
## gentle gentle 0.5827136 1 117 -0.01637606 0.01637606 4.077526e-28
## look look 0.5827136 1 165 -0.01637606 0.01637606 4.077526e-28
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N weight.Y
## photo 1 1014 0.19704433 TRUE 0.6992563 0
## cleared 1 1014 0.19704433 TRUE 0.6555528 0
## devices NA 0 0.09852217 TRUE 0.0000000 0
## defect NA 0 0.09852217 TRUE 0.0000000 0
## gentle 1 1014 0.19704433 TRUE 0.5827136 0
## look 1 1014 0.19704433 TRUE 0.5827136 0
## weight.NA
## photo 0.0000000
## cleared 0.0000000
## devices 0.6555528
## defect 0.6169908
## gentle 0.0000000
## look 0.0000000
## term weight freq pos cor.y cor.y.abs chisq.stat
## photo photo 0.6992563 1 210 -0.01637606 0.01637606 4.077526e-28
## cleared cleared 0.6555528 1 59 -0.01637606 0.01637606 4.077526e-28
## devices devices 0.6555528 1 89 NA NA NA
## defect defect 0.6169908 1 81 NA NA NA
## gentle gentle 0.5827136 1 117 -0.01637606 0.01637606 4.077526e-28
## look look 0.5827136 1 165 -0.01637606 0.01637606 4.077526e-28
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N weight.Y
## photo 1 1014 0.19704433 TRUE 0.6992563 0
## cleared 1 1014 0.19704433 TRUE 0.6555528 0
## devices NA 0 0.09852217 TRUE 0.0000000 0
## defect NA 0 0.09852217 TRUE 0.0000000 0
## gentle 1 1014 0.19704433 TRUE 0.5827136 0
## look 1 1014 0.19704433 TRUE 0.5827136 0
## weight.NA
## photo 0.0000000
## cleared 0.0000000
## devices 0.6555528
## defect 0.6169908
## gentle 0.0000000
## look 0.0000000
## Warning in weighting(x): empty document(s): character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) charact
## [1] " Top_n stem term weights for descr.my:"
## Warning in weighting(x): empty document(s): character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) charact
## Warning in cor(docms_mtrx[glb_allobs_df$.src == "Train", ],
## as.numeric(glb_allobs_df[glb_allobs_df$.src == : the standard deviation is
## zero
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## [1] "Rows: 225; Cols: 14"
## term weight freq pos cor.y cor.y.abs chisq.stat
## condit condit 82.65906 333 51 -0.013082326 0.013082326 39.43564
## likenew likenew 67.91742 46 116 -0.024950908 0.024950908 11.19258
## use use 59.22119 162 209 -0.006636197 0.006636197 32.24507
## in in 58.77724 291 97 -0.075073146 0.075073146 29.13620
## is is 51.07423 217 109 0.007630386 0.007630386 50.21794
## and and 49.98953 201 19 0.026439136 0.026439136 31.35854
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N
## condit 0.008703004 22.22857 2.167488 TRUE 45.49983
## likenew 0.738827061 196.60000 1.576355 TRUE 36.90375
## use 0.073312742 49.88889 2.266010 TRUE 33.13193
## in 0.141013206 22.55556 2.266010 TRUE 35.19219
## is 0.006114667 43.75000 2.857143 TRUE 25.75795
## and 0.089008983 54.25000 2.266010 TRUE 27.56305
## weight.Y weight.NA
## condit 11.341542 25.81769
## likenew 5.416672 25.59700
## use 8.493944 17.59532
## in 5.786290 17.79877
## is 7.373010 17.94327
## and 8.827670 13.59881
## term weight freq pos cor.y cor.y.abs chisq.stat
## this this 30.522817 128 200 -0.008850439 0.008850439 22.897336
## devic devic 21.897804 68 62 0.020563983 0.020563983 25.017626
## may may 11.279836 30 124 0.056996964 0.056996964 8.434436
## factori factori 10.579543 18 70 -0.024745597 0.024745597 6.407545
## had had 4.471834 10 89 0.019521472 0.019521472 8.986913
## defect defect 3.031242 5 59 0.059079189 0.059079189 7.637620
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N
## this 0.086344067 54.88235 1.5763547 TRUE 15.9363259
## devic 0.009062673 48.25000 1.1822660 TRUE 12.1205231
## may 0.207970960 166.50000 0.6896552 TRUE 3.5068483
## factori 0.601680039 251.00000 0.8866995 TRUE 6.3031466
## had 0.174315394 335.66667 0.6896552 TRUE 2.4785585
## defect 0.054125873 1012.00000 0.3940887 TRUE 0.6805764
## weight.Y weight.NA
## this 4.0061123 10.5803784
## devic 4.1059479 5.6713334
## may 2.3642176 5.4087696
## factori 0.7898649 3.4865316
## had 1.0938977 0.8993777
## defect 1.3257981 1.0248679
## term weight freq pos cor.y cor.y.abs chisq.stat
## 4g 4g 1.5403968 2 9 -0.01637606 0.01637606 4.077526e-28
## gold gold 1.0488844 1 85 -0.01637606 0.01637606 4.077526e-28
## such such 0.9488844 2 193 0.08520829 0.08520829 3.427842e+00
## ipad4 ipad4 0.8740704 1 103 -0.01637606 0.01637606 4.077526e-28
## ipad1 ipad1 0.8068342 1 100 -0.01637606 0.01637606 4.077526e-28
## sprint sprint 0.7492032 1 190 -0.01637606 0.01637606 4.077526e-28
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N weight.Y
## 4g 1.0000000 1014.0 0.1970443 TRUE 0.6777746 0.0000000
## gold 1.0000000 1014.0 0.1970443 TRUE 1.0488844 0.0000000
## such 0.0641058 506.5 0.1970443 TRUE 0.0000000 0.9488844
## ipad4 1.0000000 1014.0 0.1970443 TRUE 0.8740704 0.0000000
## ipad1 1.0000000 1014.0 0.1970443 TRUE 0.8068342 0.0000000
## sprint 1.0000000 1014.0 0.1970443 TRUE 0.7492032 0.0000000
## weight.NA
## 4g 0.8626222
## gold 0.0000000
## such 0.0000000
## ipad4 0.0000000
## ipad1 0.0000000
## sprint 0.0000000
## term weight freq pos cor.y cor.y.abs chisq.stat
## 4g 4g 1.5403968 2 9 -0.01637606 0.01637606 4.077526e-28
## gold gold 1.0488844 1 85 -0.01637606 0.01637606 4.077526e-28
## such such 0.9488844 2 193 0.08520829 0.08520829 3.427842e+00
## ipad4 ipad4 0.8740704 1 103 -0.01637606 0.01637606 4.077526e-28
## ipad1 ipad1 0.8068342 1 100 -0.01637606 0.01637606 4.077526e-28
## sprint sprint 0.7492032 1 190 -0.01637606 0.01637606 4.077526e-28
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N weight.Y
## 4g 1.0000000 1014.0 0.1970443 TRUE 0.6777746 0.0000000
## gold 1.0000000 1014.0 0.1970443 TRUE 1.0488844 0.0000000
## such 0.0641058 506.5 0.1970443 TRUE 0.0000000 0.9488844
## ipad4 1.0000000 1014.0 0.1970443 TRUE 0.8740704 0.0000000
## ipad1 1.0000000 1014.0 0.1970443 TRUE 0.8068342 0.0000000
## sprint 1.0000000 1014.0 0.1970443 TRUE 0.7492032 0.0000000
## weight.NA
## 4g 0.8626222
## gold 0.0000000
## such 0.0000000
## ipad4 0.0000000
## ipad1 0.0000000
## sprint 0.0000000
## Warning in weighting(x): empty document(s): character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) charact
## [,1]
## terms.post.stop.n -0.038467757
## terms.post.stop.n.log -0.033752177
## weight.post.stop.sum -0.017806251
## terms.post.stem.n -0.037108281
## terms.post.stem.n.log -0.033160500
## weight.post.stem.sum -0.019167392
## terms.n.stem.stop.Ratio 0.061427272
## weight.sum.stem.stop.Ratio -0.003205858
## Loading required package: wordcloud
## Loading required package: RColorBrewer
## [1] " Wordcloud post-stem term weights for descr.my:"
## Warning in wordcloud(d$word, d$freq, min.freq = glb_txt_terms_control
## $bounds$global[1]): condit could not be fit on page. It will not be
## plotted.
## Warning: Removed 201 rows containing missing values (geom_text).
## label step_major step_minor label_minor bgn
## 5 extract.features_build.corpus 4 0 0 28.017
## 6 extract.features_extract.DTM 5 0 0 43.610
## end elapsed
## 5 43.61 15.593
## 6 NA NA
## [1] "Extracting term weights for descr.my..."
## Warning in weighting(x): empty document(s): character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) character(0) character(0) character(0) character(0)
## character(0) charact
## label step_major step_minor label_minor bgn
## 6 extract.features_extract.DTM 5 0 0 43.610
## 7 extract.features_report.DTM 6 0 0 44.445
## end elapsed
## 6 44.444 0.834
## 7 NA NA
## [1] "Reporting term weights for descr.my..."
## [1] " Full TermMatrix:"
## <<DocumentTermMatrix (documents: 1437, terms: 225)>>
## Non-/sparse entries: 7123/316202
## Sparsity : 98%
## Maximal term length: 10
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Warning in cor(docms_mtrx[glb_allobs_df$.src == "Train", ],
## as.numeric(glb_allobs_df[glb_allobs_df$.src == : the standard deviation is
## zero
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## [1] " Sparse TermMatrix:"
## <<DocumentTermMatrix (documents: 1437, terms: 24)>>
## Non-/sparse entries: 3358/31130
## Sparsity : 90%
## Maximal term length: 7
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in myplot_scatter(plt_terms_df, "freq", "weight", colorcol_name =
## "in.sprs"): converting in.sprs to class:factor
## Warning in rm(full_terms_mtrx): object 'full_terms_mtrx' not found
## label step_major step_minor label_minor bgn
## 7 extract.features_report.DTM 6 0 0 44.445
## 8 extract.features_bind.DTM 7 0 0 48.333
## end elapsed
## 7 48.333 3.888
## 8 NA NA
## Loading required package: tidyr
## [1] "Binding DTM for descr.my..."
## Warning in cor(docms_mtrx[glb_allobs_df$.src == "Train", ],
## as.numeric(glb_allobs_df[glb_allobs_df$.src == : the standard deviation is
## zero
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## Warning in chisq.test(docms_mtrx[glb_allobs_df$.src == "Train", ix],
## glb_allobs_df[glb_allobs_df$.src == : Chi-squared approximation may be
## incorrect
## label step_major step_minor label_minor bgn end
## 8 extract.features_bind.DTM 7 0 0 48.333 50.39
## 9 extract.features_bind.DXM 8 0 0 50.391 NA
## elapsed
## 8 2.057
## 9 NA
## [1] "Binding DXM for descr.my..."
## Warning in rm(log_X_df, txt_X_df): object 'log_X_df' not found
# Use model info provided in description
# mydsp_obs(list(description.contains="a[[:digit:]]"), cols=glb_dsp_cols, all=TRUE)
# glb_allobs_df[glb_allobs_df$UniqueID == 12474, "prdline.my"] <- "iPad mini"
# glb_allobs_df[glb_allobs_df$UniqueID == 12474, "color"] <- "Space Gray"
# glb_allobs_df[glb_allobs_df$UniqueID == 12474, "cellular"] <- "0"
# glb_allobs_df[glb_allobs_df$UniqueID == 12474, "carrier"] <- "None"
#
# mydsp_obs(list(description.contains="m(.{4})ll"), cols=glb_dsp_cols, all=TRUE)
# glb_allobs_df[glb_allobs_df$UniqueID == 11360, "color"] <- "Black"
# glb_allobs_df[glb_allobs_df$UniqueID == 11360, "storage"] <- "64"
# glb_allobs_df[glb_allobs_df$UniqueID == 11360, "cellular"] <- "0"
# glb_allobs_df[glb_allobs_df$UniqueID == 11360, "carrier"] <- "None"
#
# glb_allobs_df[glb_allobs_df$UniqueID == 11361, "prdline.my"] <- "iPad Air"
# glb_allobs_df[glb_allobs_df$UniqueID == 11361, "storage"] <- "32"
# glb_allobs_df[glb_allobs_df$UniqueID == 11361, "color"] <- "White"
# glb_allobs_df[glb_allobs_df$UniqueID == 11361, "cellular"] <- "0"
# glb_allobs_df[glb_allobs_df$UniqueID == 11361, "carrier"] <- "None"
# mydsp_obs(list(description.contains="mini(?!m)"), perl=TRUE, cols="D.P.mini", all=TRUE)
# mydsp_obs(list(D.P.mini=1), cols="D.P.mini", all=TRUE)
# mydsp_obs(list(D.P.mini=1, productline="Unknown"), cols="D.P.mini", all=TRUE)
# mydsp_obs(list(description.contains="(?<![fhp])air"), perl=TRUE, all=TRUE)
# mydsp_obs(list(description.contains="air"), perl=FALSE, cols="D.P.air", all=TRUE)
# mydsp_obs(list(D.P.air=1, productline="Unknown"), cols="D.P.air", all=TRUE)
# print(mycreate_sqlxtab_df(glb_allobs_df, c("prdline.my", "productline", "D.P.mini",
# glb_rsp_var)))
# print(glb_allobs_df[(glb_allobs_df$productline == "Unknown") &
# (glb_allobs_df$D.P.mini > 0),
# c(glb_id_var, glb_category_var, glb_dsp_cols, glbFeatsText)])
# glb_allobs_df[(glb_allobs_df$D.P.mini == 1) & (glb_allobs_df$productline == "Unknown"),
# "prdline.my"] <- "iPad mini"
# print(mycreate_sqlxtab_df(glb_allobs_df, c("prdline.my", "productline", "D.P.air",
# glb_rsp_var)))
# print(glb_allobs_df[(glb_allobs_df$productline == "Unknown") &
# (glb_allobs_df$D.P.air > 0),
# c(glb_id_var, glb_category_var, glb_dsp_cols, glbFeatsText)])
# #glb_allobs_df[glb_allobs_df$UniqueID == 11863, "D.P.air"] <- 0
# glb_allobs_df[(glb_allobs_df$D.P.air == 1) & (glb_allobs_df$productline == "Unknown"),
# "prdline.my"] <- "iPad Air"
# print(glb_allobs_df[(glb_allobs_df$UniqueID %in% c(11767, 11811, 12156)),
# c(glb_id_var, "sold",
# "prdline.my", "color", "condition", "cellular", "carrier", "storage"
# #, "descr.my"
# )])
# glb_allobs_df[glb_allobs_df$UniqueID == 11767, "prdline.my"] <- "iPad 2"
# glb_allobs_df[glb_allobs_df$UniqueID == 11767, "storage"] <- "32"
# glb_allobs_df[glb_allobs_df$UniqueID == 11811, "prdline.my"] <- "iPad 2"
# glb_allobs_df[glb_allobs_df$UniqueID == 12156, "prdline.my"] <- "iPad 1"
# mydsp_obs(list(prdline.my="Unknown"), all=TRUE)
# tmp_allobs_df <- glb_allobs_df[, "prdline.my", FALSE]
# names(tmp_allobs_df) <- "old.prdline.my"
# glb_allobs_df$prdline.my <-
# plyr::revalue(glb_allobs_df$prdline.my, c(
# # "iPad 1" = "iPad",
# # "iPad 2" = "iPad2+",
# "iPad 3" = "iPad 3+",
# "iPad 4" = "iPad 3+",
# "iPad 5" = "iPad 3+",
#
# "iPad Air" = "iPadAir",
# "iPad Air 2" = "iPadAir",
#
# "iPad mini" = "iPadmini",
# "iPad mini 2" = "iPadmini 2+",
# "iPad mini 3" = "iPadmini 2+",
# "iPad mini Retina" = "iPadmini 2+"
# ))
# tmp_allobs_df$prdline.my <- glb_allobs_df[, "prdline.my"]
# print(mycreate_sqlxtab_df(tmp_allobs_df, c("prdline.my", "old.prdline.my")))
# print(mycreate_sqlxtab_df(tmp_allobs_df, c("prdline.my")))
# print(mycreate_sqlxtab_df(subset(glb_allobs_df, color == "Unknown"),
# c("color", "D.P.black", "D.P.gold", "D.P.spacegray", "D.P.white")))
# print(glb_allobs_df[(glb_allobs_df$color == "Unknown") & (glb_allobs_df$D.P.black > 0),
# c(glb_id_var, "color", "D.P.black", "sold", "prdline.my", "condition",
# "cellular", "carrier", "storage", "descr.my")])
# glb_allobs_df[glb_allobs_df$UniqueID == 12137, "color"] <- "Black"
# print(glb_allobs_df[(glb_allobs_df$color == "Unknown") & (glb_allobs_df$D.P.spacegray > 0),
# c(glb_id_var, "color", "D.P.spacegray", "prdline.my", "condition",
# "cellular", "carrier", "storage", "descr.my")])
# glb_allobs_df[glb_allobs_df$UniqueID %in% c(12106), "color"] <- "Space Gray"
# print(glb_allobs_df[(glb_allobs_df$color == "Unknown") & (glb_allobs_df$D.P.white > 0),
# c(glb_id_var, "color", "D.P.white", "prdline.my", "condition",
# "cellular", "carrier", "storage", "descr.my")])
# glb_allobs_df[glb_allobs_df$UniqueID %in% c(10573, 10809, 10925, 11735), "color"] <-
# "White"
# Add a ".fctr" factor twin for each categorical device attribute in one vectorised
# assignment; new columns are appended in the order carrier, cellular, color, storage.
# glb_allobs_df$prdline.my.fctr <- as.factor(glb_allobs_df$prdline.my)
glb_allobs_df[, paste0(c("carrier", "cellular", "color", "storage"), ".fctr")] <-
    lapply(glb_allobs_df[, c("carrier", "cellular", "color", "storage")], as.factor)
#stop(here"); sav_allobs_df <- glb_allobs_df; glb_allobs_df <- sav_allobs_df
# glb_allobs_df %>%
# unite(prdl.descr.my, c(productline, as.numeric(D.chrs.n.log > 0), sep="#"))
# unite_("prdl.descr.my", interp(~c("productline", as.numeric(D.chrs.n.log > 0), sep="#")))
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# print(myplot_scatter(glb_trnobs_df, "<col1_name>", "<col2_name>", smooth=TRUE))
#stop(here"); glb_to_sav(); glb_allobs_df <- sav_allobs_df
# For every price variable listed in glbFeatsPrice, add 0/1 indicator columns that flag
# whether a given digit of the price equals 9:
#   <var>.dgt<k>.is9 : k-th digit left of the decimal point (k = 1 is the units digit)
#   <var>.dcm<k>.is9 : k-th digit right of the decimal point (k = 1, 2)
# NA prices produce NA indicators. Prices are assumed to be non-negative.
if (!is.null(glbFeatsPrice)) {
    for (var in glbFeatsPrice) {
        # Number of integer digits of the largest price. max(1, ...) keeps the count at
        # one or more even when every price is below 1, zero or NA; the original
        # 1:(log10(max) + 1) either failed or produced a bogus digit 0 in those cases.
        for (digit in seq_len(floor(log10(max(1, glb_allobs_df[, var], na.rm=TRUE)) + 1))) {
            glb_allobs_df[, paste0(var, ".dgt", digit, ".is9")] <-
                as.numeric(trunc((trunc(glb_allobs_df[, var]) %% (10 ^ digit)) /
                                     (10 ^ (digit - 1))) == 9)
        }

        for (decimal in 1:2) {
            # Binary floating point cannot represent most decimal prices exactly:
            # 19.99 * 100 evaluates to 1998.9999999999998, which plain truncation turns
            # into 1998 and the trailing 9 is lost. The small tolerance added before
            # truncating recovers the intended digit without affecting values that
            # carry at most two decimals.
            glb_allobs_df[, paste0(var, ".dcm", decimal, ".is9")] <-
                as.numeric(trunc(glb_allobs_df[, var] * (10 ^ decimal) + 1e-6) %% 10 == 9)
        }
    }
    #as.numeric((as.integer(startprice) %% 10) == 9)
}
# Release the text-mining intermediates. Only the names that actually exist in the
# current environment are removed, so a run that never built them (e.g. no text
# features) no longer raises "object '...' not found" warnings.
rm(list = intersect(c("corpus_lst"
                      , "glb_sprs_DTM_lst" #, "glb_full_DTM_lst"
                      , "txt_corpus", "txt_vctr"),
                    ls()))
## Warning in rm(corpus_lst, glb_sprs_DTM_lst, txt_corpus, txt_vctr): object
## 'corpus_lst' not found
extract.features_chunk_df <- myadd_chunk(extract.features_chunk_df, "extract.features_end",
major.inc=TRUE)
## label step_major step_minor label_minor bgn
## 9 extract.features_bind.DXM 8 0 0 50.391
## 10 extract.features_end 9 0 0 149.320
## end elapsed
## 9 149.319 98.928
## 10 NA NA
myplt_chunk(extract.features_chunk_df)
## label step_major
## 9 extract.features_bind.DXM 8
## 5 extract.features_build.corpus 4
## 7 extract.features_report.DTM 6
## 8 extract.features_bind.DTM 7
## 3 extract.features_process.text 3
## 6 extract.features_extract.DTM 5
## 2 extract.features_factorize.str.vars 2
## 1 extract.features_bgn 1
## 4 extract.features_process.text_reporting_compound_terms 3
## step_minor label_minor bgn end elapsed duration
## 9 0 0 50.391 149.319 98.928 98.928
## 5 0 0 28.017 43.610 15.593 15.593
## 7 0 0 44.445 48.333 3.888 3.888
## 8 0 0 48.333 50.390 2.057 2.057
## 3 0 0 27.004 28.011 1.007 1.007
## 6 0 0 43.610 44.444 0.834 0.834
## 2 0 0 26.927 27.004 0.077 0.077
## 1 0 0 26.911 26.926 0.016 0.015
## 4 1 1 28.011 28.016 0.005 0.005
## [1] "Total Elapsed Time: 149.319 secs"
# if (glb_save_envir)
# save(glb_feats_df,
# glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
# file=paste0(glb_out_pfx, "extract_features_dsk.RData"))
# load(paste0(glb_out_pfx, "extract_features_dsk.RData"))
# Replay the analytics Petri net (glb_analytics_pn) after feature extraction.
# The replay.trans argument contains an embedded assignment: "data.training.all" and
# "data.new" are appended to glb_analytics_avl_objs, and the extended vector is both
# stored back into that global and passed to replay.petrisim. The global is therefore
# updated only when the argument is evaluated.
# NOTE(review): presumably these two names mark the transitions that are now
# available to fire - confirm against the replay.petrisim definition.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all","data.new")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
#glb_chunks_df <- myadd_chunk(glb_chunks_df, "manage.missing.data", major.inc=TRUE)
3.0: extract features
glb_chunks_df <- myadd_chunk(glb_chunks_df, "manage.missing.data", major.inc=FALSE)
## label step_major step_minor label_minor bgn end
## 5 extract.features 3 0 0 26.903 154.461
## 6 manage.missing.data 3 1 1 154.462 NA
## elapsed
## 5 127.559
## 6 NA
# If mice crashes with error: Error in get(as.character(FUN), mode = "function", envir = envir) : object 'State' of mode 'function' was not found
# consider excluding 'State' as a feature
# print(sapply(names(glb_trnobs_df), function(col) sum(is.na(glb_trnobs_df[, col]))))
# print(sapply(names(glb_newobs_df), function(col) sum(is.na(glb_newobs_df[, col]))))
# glb_trnobs_df <- na.omit(glb_trnobs_df)
# glb_newobs_df <- na.omit(glb_newobs_df)
# df[is.na(df)] <- 0
mycheck_problem_data(glb_allobs_df, featsExclude = glbFeatsExclude,
fctrMaxUniqVals = glbFctrMaxUniqVals)
## [1] "numeric data missing in : "
## sold sold.fctr
## 422 422
## [1] "numeric data w/ 0s in : "
## biddable sold
## 1437 798
## cellular.fctr D.terms.post.stop.n
## 801 762
## D.terms.post.stop.n.log D.weight.post.stop.sum
## 762 762
## D.terms.post.stem.n D.terms.post.stem.n.log
## 762 762
## D.weight.post.stem.sum D.T.condit
## 762 1104
## D.T.is D.T.has
## 1220 1340
## D.T.or D.T.this
## 1350 1309
## D.T.it D.T.of
## 1352 1346
## D.T.some D.T.have
## 1381 1377
## D.T.devic D.T.for.
## 1369 1400
## D.T.wear D.T.but
## 1371 1382
## D.T.been D.T.broken
## 1390 1424
## D.T.will D.T.from
## 1413 1392
## D.T.damag D.T.sign
## 1420 1392
## D.T.list D.T.cover
## 1398 1423
## D.T.part D.T.tear
## 1424 1414
## D.T.smart D.wrds.n.log
## 1429 760
## D.wrds.unq.n.log D.weight.sum
## 762 762
## D.ratio.weight.sum.wrds.n D.chrs.n.log
## 762 760
## D.chrs.uppr.n.log D.dgts.n.log
## 762 1293
## D.chrs.pnct01.n.log D.chrs.pnct02.n.log
## 1397 1437
## D.chrs.pnct03.n.log D.chrs.pnct04.n.log
## 1436 1437
## D.chrs.pnct05.n.log D.chrs.pnct06.n.log
## 1378 1390
## D.chrs.pnct07.n.log D.chrs.pnct08.n.log
## 1420 1378
## D.chrs.pnct09.n.log D.chrs.pnct10.n.log
## 1422 1428
## D.chrs.pnct11.n.log D.chrs.pnct12.n.log
## 1234 1369
## D.chrs.pnct13.n.log D.chrs.pnct14.n.log
## 1007 1383
## D.chrs.pnct15.n.log D.chrs.pnct16.n.log
## 1423 1432
## D.chrs.pnct17.n.log D.chrs.pnct18.n.log
## 1436 1436
## D.chrs.pnct19.n.log D.chrs.pnct20.n.log
## 1437 1437
## D.chrs.pnct21.n.log D.chrs.pnct22.n.log
## 1437 1437
## D.chrs.pnct23.n.log D.chrs.pnct24.n.log
## 1437 1437
## D.chrs.pnct25.n.log D.chrs.pnct26.n.log
## 1437 1437
## D.chrs.pnct27.n.log D.chrs.pnct28.n.log
## 1437 1429
## D.chrs.pnct29.n.log D.chrs.pnct30.n.log
## 1437 1437
## D.wrds.stop.n.log D.P.http
## 1042 1437
## D.P.mini D.P.air
## 1418 1425
## D.P.black D.P.white
## 1432 1432
## D.P.gold D.P.spacegray
## 1436 1435
## startprice.dgt1.is9 startprice.dgt2.is9
## 871 1226
## startprice.dgt3.is9 startprice.dcm1.is9
## 1432 878
## startprice.dcm2.is9
## 996
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## description condition cellular carrier color storage
## 760 0 0 0 0 0
## productline .grpid descr.my
## 0 NA 760
# glb_allobs_df <- na.omit(glb_allobs_df)
# Not refactored into mydsutils.R since glb_*_df might be reassigned
# Impute missing values in the candidate feature columns of glb_allobs_df using mice.
#
# Reads the globals glb_allobs_df, glbFeatsExclude, glb_rsp_var and
# glb_mice_complete.seed. Columns named in glbFeatsExclude and the response variable
# are left out of the imputation. Prints a summary of the data before and after.
#
# Returns a data.frame holding only the columns whose values were changed by the
# imputation. The result is always a data.frame, including when zero or exactly one
# column changed, so callers can rely on ncol() and names().
glb_impute_missing_data <- function() {
    require(mice)
    set.seed(glb_mice_complete.seed)

    # drop = FALSE keeps a data.frame even if a single candidate column remains
    inp_impent_df <- glb_allobs_df[, setdiff(names(glb_allobs_df),
                                             union(glbFeatsExclude, glb_rsp_var)),
                                   drop = FALSE]
    print("Summary before imputation: ")
    print(summary(inp_impent_df))

    # Namespace-qualified: tidyr, which this file also loads, exports a complete()
    # that would mask the mice one when attached later.
    out_impent_df <- mice::complete(mice::mice(inp_impent_df))
    print(summary(out_impent_df))

    is_imputed <- vapply(names(out_impent_df), function(col) {
        if (identical(out_impent_df[, col], inp_impent_df[, col]))
            return(FALSE)
        # complete(mice()) changes attributes of factors even though values don't
        # change, so factors are compared on their underlying codes.
        !(inherits(out_impent_df[, col], "factor") &&
              identical(as.numeric(out_impent_df[, col]),
                        as.numeric(inp_impent_df[, col])))
    }, logical(1))
    ret_vars <- names(out_impent_df)[is_imputed]

    # Without drop = FALSE a single imputed column collapses to a bare vector and the
    # caller's ncol() check returns NULL.
    return(out_impent_df[, ret_vars, drop = FALSE])
}
# When imputation is enabled and numeric data is missing, run the imputation and keep
# its result in nonna_df (assigned inside the condition). Each imputed column is stored
# alongside the original under a ".nonNA" suffix, and the original column names are
# appended to glbFeatsExclude.
if (glb_impute_na_data &&
    (length(myfind_numerics_missing(glb_allobs_df)) > 0) &&
    (ncol(nonna_df <- glb_impute_missing_data()) > 0)) {
    for (col in names(nonna_df))
        glb_allobs_df[[paste0(col, ".nonNA")]] <- nonna_df[[col]]
    glbFeatsExclude <- c(glbFeatsExclude, names(nonna_df))
}
mycheck_problem_data(glb_allobs_df, featsExclude = glbFeatsExclude,
fctrMaxUniqVals = glbFctrMaxUniqVals, terminate = TRUE)
## [1] "numeric data missing in : "
## sold sold.fctr
## 422 422
## [1] "numeric data w/ 0s in : "
## biddable sold
## 1437 798
## cellular.fctr D.terms.post.stop.n
## 801 762
## D.terms.post.stop.n.log D.weight.post.stop.sum
## 762 762
## D.terms.post.stem.n D.terms.post.stem.n.log
## 762 762
## D.weight.post.stem.sum D.T.condit
## 762 1104
## D.T.is D.T.has
## 1220 1340
## D.T.or D.T.this
## 1350 1309
## D.T.it D.T.of
## 1352 1346
## D.T.some D.T.have
## 1381 1377
## D.T.devic D.T.for.
## 1369 1400
## D.T.wear D.T.but
## 1371 1382
## D.T.been D.T.broken
## 1390 1424
## D.T.will D.T.from
## 1413 1392
## D.T.damag D.T.sign
## 1420 1392
## D.T.list D.T.cover
## 1398 1423
## D.T.part D.T.tear
## 1424 1414
## D.T.smart D.wrds.n.log
## 1429 760
## D.wrds.unq.n.log D.weight.sum
## 762 762
## D.ratio.weight.sum.wrds.n D.chrs.n.log
## 762 760
## D.chrs.uppr.n.log D.dgts.n.log
## 762 1293
## D.chrs.pnct01.n.log D.chrs.pnct02.n.log
## 1397 1437
## D.chrs.pnct03.n.log D.chrs.pnct04.n.log
## 1436 1437
## D.chrs.pnct05.n.log D.chrs.pnct06.n.log
## 1378 1390
## D.chrs.pnct07.n.log D.chrs.pnct08.n.log
## 1420 1378
## D.chrs.pnct09.n.log D.chrs.pnct10.n.log
## 1422 1428
## D.chrs.pnct11.n.log D.chrs.pnct12.n.log
## 1234 1369
## D.chrs.pnct13.n.log D.chrs.pnct14.n.log
## 1007 1383
## D.chrs.pnct15.n.log D.chrs.pnct16.n.log
## 1423 1432
## D.chrs.pnct17.n.log D.chrs.pnct18.n.log
## 1436 1436
## D.chrs.pnct19.n.log D.chrs.pnct20.n.log
## 1437 1437
## D.chrs.pnct21.n.log D.chrs.pnct22.n.log
## 1437 1437
## D.chrs.pnct23.n.log D.chrs.pnct24.n.log
## 1437 1437
## D.chrs.pnct25.n.log D.chrs.pnct26.n.log
## 1437 1437
## D.chrs.pnct27.n.log D.chrs.pnct28.n.log
## 1437 1429
## D.chrs.pnct29.n.log D.chrs.pnct30.n.log
## 1437 1437
## D.wrds.stop.n.log D.P.http
## 1042 1437
## D.P.mini D.P.air
## 1418 1425
## D.P.black D.P.white
## 1432 1432
## D.P.gold D.P.spacegray
## 1436 1435
## startprice.dgt1.is9 startprice.dgt2.is9
## 871 1226
## startprice.dgt3.is9 startprice.dcm1.is9
## 1432 878
## startprice.dcm2.is9
## 996
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## description condition cellular carrier color storage
## 760 0 0 0 0 0
## productline .grpid descr.my
## 0 NA 760
3.1: manage missing data
glb_chunks_df <- myadd_chunk(glb_chunks_df, "cluster.data", major.inc=FALSE)
## label step_major step_minor label_minor bgn end
## 6 manage.missing.data 3 1 1 154.462 154.625
## 7 cluster.data 3 2 2 154.626 NA
## elapsed
## 6 0.163
## 7 NA
# Entropy of entropy_var within each (by_var, .clusterid) group.
#
# Arguments:
#   obs_df      : data.frame of observations
#   entropy_var : name of the column whose value distribution is measured
#   by_var      : name of the grouping column; when NULL every row is placed in a
#                 single ".default" group
#
# A ".clusterid" column equal to 1 is added when obs_df does not already have one.
# Rows with NA in entropy_var are ignored.
#
# Returns one row per group: a key column named "<by_var>.clusterid" (by_var and
# .clusterid united), one count column per distinct entropy_var value, ".entropy"
# (maximum-likelihood entropy of the counts) and ".knt" (total count in the group).
mycompute_entropy_df <- function(obs_df, entropy_var, by_var=NULL) {
    require(lazyeval)
    require(dplyr)
    require(tidyr)
    require(entropy)    # entropy() is called below; do not rely on the caller loading it

    if (is.null(by_var)) {
        by_var <- ".default"
        obs_df$.default <- as.factor(".default")
    }

    # Exact name match: a substring test would be fooled by names like "x.clusterid"
    if (!(".clusterid" %in% names(obs_df)))
        obs_df$.clusterid <- 1

    cluster_df <- obs_df %>%
        count_(c(by_var, ".clusterid", entropy_var)) %>%
        dplyr::filter(n > 0) %>%
        dplyr::filter_(interp(~(!is.na(var)), var=as.name(entropy_var))) %>%
        unite_(paste0(by_var, ".clusterid"),
               c(interp(by_var), ".clusterid")) %>%
        spread_(interp(entropy_var), "n", fill=0)

    # Both statistics are computed before either is attached, so the count columns
    # (everything but the first, key, column) are not polluted by the new columns.
    # seq_len() + vapply() stay well-defined when no group survives the filters.
    row_ixs <- seq_len(nrow(cluster_df))
    tmp.entropy <- vapply(row_ixs,
        function(row) entropy(as.numeric(cluster_df[row, -1]), method = "ML"),
        numeric(1))
    tmp.knt <- vapply(row_ixs,
        function(row) sum(as.numeric(cluster_df[row, -1])),
        numeric(1))
    cluster_df$.entropy <- tmp.entropy
    cluster_df$.knt <- tmp.knt

    return(cluster_df)
}
if (glb_cluster) {
require(proxy)
#require(hash)
require(dynamicTreeCut)
require(entropy)
require(tidyr)
require(ggdendro)
# Weighted cosine-style distance between all pairs of rows of x.
#   x       : matrix, or anything as.matrix() can convert; one observation per row
#   y       : present in the signature but never used by the body
#   weights : one weight per column of x; every column gets weight 1 when NULL
# Returns a square matrix (one row and one column per row of x) holding
# 1 - similarity. Pairs whose similarity is NaN (e.g. an all-zero row, 0 / 0) get
# distance 1, and the diagonal is forced to 0.
# NOTE(review): the normalised weights multiply both operands of the cross product, so
# they enter the numerator squared but the row norms only once; with equal weights this
# reduces to the ordinary cosine distance - confirm this is the intended weighting.
mywgtdcosine_dist <- function(x, y=NULL, weights=NULL) {
    obs_mtrx <- if (inherits(x, "matrix")) x else as.matrix(x)
    col_wgts <- if (is.null(weights)) rep(1, ncol(obs_mtrx)) else weights
    n_obs <- nrow(obs_mtrx)

    # Normalised weights replicated so that every row of the matrix is the weight vector
    nrm_wgts_mtrx <- matrix(rep(col_wgts / sum(col_wgts), n_obs),
                            nrow = n_obs, byrow = TRUE)

    scaled_mtrx <- obs_mtrx * nrm_wgts_mtrx
    # Column matrix of the weighted sum of squares of each row
    row_sqsums <- as.matrix(rowSums((obs_mtrx ^ 2) * nrm_wgts_mtrx))

    similarity <- (sum(col_wgts) * (scaled_mtrx %*% t(scaled_mtrx))) /
        sqrt(row_sqsums %*% t(row_sqsums))

    dstns_mtrx <- 1 - similarity
    dstns_mtrx[is.nan(dstns_mtrx)] <- 1
    diag(dstns_mtrx) <- 0
    dstns_mtrx
}
#pr_DB$delete_entry("mywgtdcosine");
# Need to do this only once across runs ?
if (!pr_DB$entry_exists("mywgtdcosine")) {
pr_DB$set_entry(FUN = mywgtdcosine_dist, names = c("mywgtdcosine"))
pr_DB$modify_entry(names="mywgtdcosine", type="metric", loop=FALSE)
}
#pr_DB$get_entry("mywgtdcosine")
# glb_hash <- hash(key=unique(glb_allobs_df$myCategory),
# values=1:length(unique(glb_allobs_df$myCategory)))
# glb_hash_lst <- hash(key=unique(glb_allobs_df$myCategory),
# values=1:length(unique(glb_allobs_df$myCategory)))
#stop(here"); glb_to_sav(); glb_allobs_df <- sav_allobs_df
cluster_vars <- grep(paste0("[",
toupper(paste0(substr(glbFeatsText, 1, 1), collapse = "")),
"]\\.[PT]\\."),
names(glb_allobs_df), value = TRUE)
# Assign correlations with rsp_var as weights for cosine distance
print("Clustering features: ")
cluster_vars_df <- data.frame(abs.cor.y = abs(cor(
glb_allobs_df[glb_allobs_df$.src == "Train", cluster_vars],
as.numeric(glb_allobs_df[glb_allobs_df$.src == "Train", glb_rsp_var]),
use = "pairwise.complete.obs")))
print(tail(cluster_vars_df <- orderBy(~ abs.cor.y,
subset(cluster_vars_df, !is.na(abs.cor.y))), 5))
print(sprintf(" .rnorm cor: %0.4f",
cor(glb_allobs_df[glb_allobs_df$.src == "Train", ".rnorm"],
as.numeric(glb_allobs_df[glb_allobs_df$.src == "Train", glb_rsp_var]),
use = "pairwise.complete.obs")))
print(sprintf("glb_allobs_df Entropy: %0.4f",
allobs_ent <- entropy(table(glb_allobs_df[, glb_cluster_entropy_var]),
method="ML")))
print(category_df <- mycompute_entropy_df(obs_df=glb_allobs_df,
entropy_var=glb_cluster_entropy_var,
by_var=glb_category_var))
print(sprintf("glb_allobs_df$%s Entropy: %0.4f (%0.4f pct)",
glb_category_var,
category_ent <- weighted.mean(category_df$.entropy, category_df$.knt),
100 * category_ent / allobs_ent))
glb_allobs_df$.clusterid <- 1
#print(max(table(glb_allobs_df$myCategory.fctr) / 20))
#stop(here"); glb_to_sav()
grp_ids <- sort(unique(glb_allobs_df[, glb_category_var]))
glb_cluster_size_df_lst <- list()
png(paste0(glb_out_pfx, "FeatsTxtClusters.png"),
width = 480 * 2, height = 480 * length(grp_ids))
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow = length(grp_ids), ncol = 2)))
pltIx <- 1
# For each category in grp_ids:
#   1. build a weighted-cosine distance matrix over the text features,
#   2. print the farthest and the closest pair of observations,
#   3. hierarchically cluster the category (Ward D2) and draw its dendrogram,
#   4. try five minimum cluster sizes and keep the one with the lowest
#      .knt-weighted entropy of glb_cluster_entropy_var,
#   5. store the resulting cluster ids in glb_allobs_df$.clusterid.
# Plots go to row pltIx of the grid layout set up before the loop.
for (grp in grp_ids) {
#     if (grep(grp, levels(grp_ids)) <= 6) next
#     if (grep(grp, levels(grp_ids)) > 9) next
#     if (grep(grp, levels(grp_ids)) != 10) next
    print(sprintf("Category: %s", grp))
    ctgry_allobs_df <- glb_allobs_df[glb_allobs_df[, glb_category_var] == grp, ]

    # hclust() requires at least two objects; a smaller category keeps the
    # default .clusterid assigned before the loop instead of aborting the run.
    if (nrow(ctgry_allobs_df) < 2) {
        print(sprintf("Category: %s has fewer than 2 observations; clustering skipped",
                      grp))
        pltIx <- pltIx + 1
        next
    }

    if (!inherits(ctgry_allobs_df[, glb_cluster_entropy_var], "factor"))
        ctgry_allobs_df[, glb_cluster_entropy_var] <-
            as.factor(ctgry_allobs_df[, glb_cluster_entropy_var])

    #dstns_dist <- proxy::dist(ctgry_allobs_df[, cluster_vars], method = "cosine")
    dstns_dist <- proxy::dist(ctgry_allobs_df[, row.names(cluster_vars_df)],
                              method = "mywgtdcosine",
                              weights = cluster_vars_df$abs.cor.y)
    # Custom distance functions return a crossdist object
    #dstns_mtrx <- as.matrix(dstns_dist)
    dstns_mtrx <- matrix(as.vector(dstns_dist), nrow = attr(dstns_dist, "dim")[1],
                         dimnames = attr(dstns_dist, "dimnames"))
    dstns_dist <- as.dist(dstns_mtrx)

    # Farthest pair. which.max() yields a column-major linear index, so it is
    # converted with arrayInd(); the smaller position is listed first.
    print(sprintf("max distance(%0.4f) pair:", max(dstns_mtrx)))
#     print(dim(dstns_mtrx))
#     print(sprintf("which.max: %d", which.max(dstns_mtrx)))
    pair_ix <- arrayInd(which.max(dstns_mtrx), dim(dstns_mtrx))
    row_ix <- min(pair_ix)
    col_ix <- max(pair_ix)
#     print(sprintf("row_ix: %d", row_ix)); print(sprintf("col_ix: %d", col_ix));
#     print(dim(ctgry_allobs_df))
    print(ctgry_allobs_df[c(row_ix, col_ix),
        c(glb_id_var, glb_cluster_entropy_var, glb_category_var, glbFeatsText, cluster_vars)])

    # Closest pair of *distinct* observations. The diagonal is masked with Inf
    # (not 1) so a self-pair can never win, even when every off-diagonal
    # distance equals 1.
    min_dstns_mtrx <- dstns_mtrx
    diag(min_dstns_mtrx) <- Inf
    # Float representations issue -2.22e-16 vs. 0.0000
    print(sprintf("min distance(%0.4f) pair:", min(min_dstns_mtrx)))
    pair_ix <- arrayInd(which.min(min_dstns_mtrx), dim(min_dstns_mtrx))
    row_ix <- min(pair_ix)
    col_ix <- max(pair_ix)
    print(ctgry_allobs_df[c(row_ix, col_ix),
        c(glb_id_var, glb_cluster_entropy_var, glb_category_var, glbFeatsText,
          cluster_vars)])

    set.seed(glb_cluster.seed)
    clusters <- hclust(dstns_dist, method = "ward.D2")
    # Workaround to avoid "Error in cutree(dendro, h = heightcutoff) : the 'height' component of 'tree' is not sorted (increasingly)"
    # all.equal() returns a character description (not FALSE) on mismatch, so
    # it must be wrapped in isTRUE() before being used as an if() condition.
    if (isTRUE(with(clusters, all.equal(height, sort(height)))))
        clusters$height <- round(clusters$height, 6)

    clusters$labels <- ctgry_allobs_df[, glb_id_var]
    clustersDD <- dendro_data(clusters)
    # Attach the response value of each leaf so the leaves can be colored by it
    clustersDD$labels[, glb_rsp_var] <- sapply(clustersDD$labels$label, function(id)
        ctgry_allobs_df[id == ctgry_allobs_df[, glb_id_var], glb_rsp_var])
    print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
            geom_point(data = clustersDD$labels,
            aes_string(x = "x", color = glb_rsp_var), y = min(clustersDD$segments$y)) +
            coord_flip(ylim = c(min(clustersDD$segments$y),
                                max(clustersDD$segments$y))) +
            ggtitle(grp),
        vp = viewport(layout.pos.row = pltIx, layout.pos.col = 1))

#     clusters$labels <- ctgry_allobs_df[, glb_id_var]
#     clustersDD <- dendro_data(clusters)
#     clustersDD$labels$color <- sapply(clustersDD$labels$label, function(id)
#         ctgry_allobs_df[id == ctgry_allobs_df[, glb_id_var], glb_rsp_var])
#     print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
#             geom_point(data = clustersDD$labels,
#             aes_string(x = "x", color = "color"), y = min(clustersDD$segments$y)) +
#             coord_flip(ylim = c(min(clustersDD$segments$y),
#                                 max(clustersDD$segments$y))))
#     print(ggdendrogram(clustersDD, rotate = TRUE, size = 2) +
#             geom_point(data = clustersDD$labels,
#             aes_string(x = "x", y = "y", color = "color")))
#     myplclust(clusters, lab=ctgry_allobs_df[, glb_id_var],
#               lab.col=unclass(ctgry_allobs_df[, glb_cluster_entropy_var]))

    # Baseline row: the whole category treated as one cluster
    opt_minclustersize_df <- data.frame(minclustersize = nrow(ctgry_allobs_df),
        entropy = entropy(table(ctgry_allobs_df[, glb_cluster_entropy_var]),
                          method = "ML"))
    # Candidate minimum cluster sizes: five values from n/2 down to n/10
    for (minclustersize in
         as.integer(seq(nrow(ctgry_allobs_df) / 2, nrow(ctgry_allobs_df) / 10,
                        length.out = 5))) {
        clusterGroups <- cutreeDynamic(clusters, minClusterSize = minclustersize,
                                       method = "tree", deepSplit = 0)
        # Unassigned groups are labeled 0; the largest group has label 1
        clusterGroups[clusterGroups == 0] <- 1
        ctgry_allobs_df$.clusterid <- clusterGroups
        ctgry_clstrs_df <- mycompute_entropy_df(ctgry_allobs_df,
                                                glb_cluster_entropy_var)
        opt_minclustersize_df <- rbind(opt_minclustersize_df,
            data.frame(minclustersize = minclustersize,
                entropy = weighted.mean(ctgry_clstrs_df$.entropy, ctgry_clstrs_df$.knt)))
    }
    opt_minclustersize <-
        opt_minclustersize_df$minclustersize[which.min(opt_minclustersize_df$entropy)]
    opt_minclustersize_df$.color <-
        ifelse(opt_minclustersize_df$minclustersize == opt_minclustersize,
               "red", "blue")
    print(ggplot(data = opt_minclustersize_df,
                 mapping = aes(x = minclustersize, y = entropy)) +
            geom_point(aes(color = .color)) + scale_color_identity() +
            guides(color = "none") + geom_line(),
        vp = viewport(layout.pos.row = pltIx, layout.pos.col = 2))
    glb_cluster_size_df_lst[[grp]] <- opt_minclustersize_df

    # select minclustersize that minimizes entropy
    clusterGroups <- cutreeDynamic(clusters, minClusterSize = opt_minclustersize,
                                   method = "tree",
                                   deepSplit = 0)
    # Unassigned groups are labeled 0; the largest group has label 1
    clusterGroups[clusterGroups == 0] <- 1
    # Debug aid (a bare table() inside a loop is never auto-printed):
#     print(table(clusterGroups, ctgry_allobs_df[, glb_cluster_entropy_var],
#                 useNA = "ifany"))
    glb_allobs_df[glb_allobs_df[, glb_category_var] == grp, ".clusterid"] <-
        clusterGroups

    pltIx <- pltIx + 1
}
# Close the graphics device that received the per-category plots.
dev.off()
#all.equal(sav_allobs_df_clusterid, glb_allobs_df$.clusterid)

# Entropy after clustering, summarized as a .knt-weighted mean and reported as
# a percentage of the category-level entropy.
cluster_df <- mycompute_entropy_df(obs_df = glb_allobs_df,
                                   entropy_var = glb_cluster_entropy_var,
                                   by_var = glb_category_var)
print(cluster_df)
cluster_ent <- weighted.mean(cluster_df$.entropy, cluster_df$.knt)
print(sprintf("glb_allobs_df$%s$.clusterid Entropy: %0.4f (%0.4f pct)",
              glb_category_var,
              cluster_ent,
              100 * cluster_ent / category_ent))

glb_allobs_df$.clusterid.fctr <- as.factor(glb_allobs_df$.clusterid)
# .clusterid.fctr is created automatically (probably ?) later
glbFeatsExclude <- c(glbFeatsExclude, ".clusterid")

# Register .clusterid.fctr as an interaction-only feature keyed to the factor
# version of the category variable (".fctr" is appended when not present).
if (!is.null(glb_category_var)) {
#     glb_interaction_only_feats_lst[ifelse(grepl("\\.fctr", glb_category_var),
#                                           glb_category_var,
#                                           paste0(glb_category_var, ".fctr"))] <-
#         c(".clusterid.fctr")
    glb_interaction_only_feats_lst[[".clusterid.fctr"]] <-
        ifelse(grepl("\\.fctr", glb_category_var),
               glb_category_var,
               paste0(glb_category_var, ".fctr"))
}

# Optionally exclude the features that were used for clustering.
if (glbFeatsTextClusterVarsExclude) {
    glbFeatsExclude <- c(glbFeatsExclude, cluster_vars)
}
}
## Loading required package: proxy
##
## Attaching package: 'proxy'
##
## The following objects are masked from 'package:stats':
##
## as.dist, dist
##
## The following object is masked from 'package:base':
##
## as.matrix
##
## Loading required package: dynamicTreeCut
## Loading required package: entropy
## Loading required package: ggdendro
## [1] "Clustering features: "
## Warning in cor(glb_allobs_df[glb_allobs_df$.src == "Train",
## cluster_vars], : the standard deviation is zero
## abs.cor.y
## D.T.has 0.08174596
## D.T.cover 0.08403793
## D.T.it 0.09036475
## D.T.broken 0.09137980
## D.T.of 0.09613121
## [1] " .rnorm cor: 0.0138"
## [1] "glb_allobs_df Entropy: 0.5189"
## Loading required package: lazyeval
## Source: local data frame [20 x 5]
##
## prdl.descr.my.fctr.clusterid N Y .entropy .knt
## (chr) (dbl) (dbl) (dbl) (dbl)
## 1 Unknown#0_1 52 15 0.5317758 67
## 2 Unknown#1_1 42 11 0.5106887 53
## 3 iPad1#0_1 41 14 0.5672740 55
## 4 iPad1#1_1 37 15 0.6007684 52
## 5 iPad2#0_1 41 13 0.5519323 54
## 6 iPad2#1_1 73 19 0.5093116 92
## 7 iPad3#0_1 16 3 0.4361623 19
## 8 iPad3#1_1 43 14 0.5574652 57
## 9 iPad4#0_1 24 11 0.6224869 35
## 10 iPad4#1_1 50 8 0.4011899 58
## 11 iPadAir#0_1 41 12 0.5349082 53
## 12 iPadAir#1_1 48 5 0.3124648 53
## 13 iPadAir2#0_1 64 15 0.4860394 79
## 14 iPadAir2#1_1 21 5 0.4895519 26
## 15 iPadmini#0_1 71 20 0.5266297 91
## 16 iPadmini#1_1 37 13 0.5730569 50
## 17 iPadmini2#0_1 30 11 0.5815533 41
## 18 iPadmini2#1_1 16 4 0.5004024 20
## 19 iPadmini3#0_1 45 8 0.4243420 53
## 20 iPadmini3#1_1 6 1 0.4101163 7
## [1] "glb_allobs_df$prdl.descr.my.fctr Entropy: 0.5111 (98.4844 pct)"
## [1] "Category: Unknown#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 18 10024 N Unknown#0 0 0
## 53 10066 N Unknown#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 18 0 0 0 0 0 0 0 0
## 53 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 18 0 0 0 0 0 0 0
## 53 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 18 0 0 0 0 0 0 0
## 53 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 18 0 0 0 0 0 0 0
## 53 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 18 10024 N Unknown#0 0 0
## 18.1 10024 N Unknown#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 18 0 0 0 0 0 0 0 0
## 18.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 18 0 0 0 0 0 0 0
## 18.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 18 0 0 0 0 0 0 0
## 18.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 18 0 0 0 0 0 0 0
## 18.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: Unknown#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 4 10005 N Unknown#1
## 141 10244 N Unknown#1
## descr.my
## 4 Please feel free to buy. All products have been thoroughly inspected, cleaned and tested to be 100%
## 141 Sync/Charge cable included. Unit is in perfect working order with only minimal scuffs. No earbuds
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 4 0 0.0000000 0 0 0 0 0 0
## 141 0 0.1948067 0 0 0 0 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 4 0.3818295 0 0 0 0 0.411188 0
## 141 0.0000000 0 0 0 0 0.000000 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 4 0 0 0 0 0 0 0
## 141 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 4 0 0 0 0 0 0 0
## 141 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 4 0 0
## 141 0 0
## [1] "min distance(0.9073) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 231 10557 Y Unknown#1
## 1072 11948 <NA> Unknown#1
## descr.my
## 231 condition is 9 out of 10
## 1072 Please scroll all the way down to the bottom of the page and read the entire page thoroughly BEFORE
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of
## 231 0.4218932 0.5454586 0 0 0 0 0.7962099
## 1072 0.0000000 0.0000000 0 0 0 0 0.3981050
## D.T.some D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been
## 231 0 0 0 0 0 0 0
## 1072 0 0 0 0 0 0 0
## D.T.broken D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover
## 231 0 0 0 0 0 0 0
## 1072 0 0 0 0 0 0 0
## D.T.part D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black
## 231 0 0 0 0 0 0 0
## 1072 0 0 0 0 0 0 0
## D.P.white D.P.gold D.P.spacegray
## 231 0 0 0
## 1072 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPad1#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is D.T.has
## 7 10012 N iPad1#0 0 0 0
## 8 10014 Y iPad1#0 0 0 0
## D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic D.T.for.
## 7 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0
## D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from D.T.damag
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart D.P.http
## 7 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0
## D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 7 10012 N iPad1#0 0 0
## 7.1 10012 N iPad1#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 7 0 0 0 0 0 0 0 0
## 7.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 7 0 0 0 0 0 0 0
## 7.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 7 0 0 0 0 0 0 0
## 7.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 7 0 0 0 0 0 0 0
## 7.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPad1#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 21 10027 Y iPad1#1
## 26 10032 Y iPad1#1
## descr.my
## 21 FULLY FUNCTIONAL
## 26 In very good conditions, does show sign of use but mostly had a case on at all times. Still has a
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 21 0.0000000 0 0.0000000 0 0 0 0.0000000 0
## 26 0.1110245 0 0.2046806 0 0 0 0.2095289 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 21 0 0 0 0 0.0000000 0 0
## 26 0 0 0 0 0.2477623 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 21 0 0 0 0.0000000 0 0 0
## 26 0 0 0 0.2629995 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 21 0 0 0 0 0 0 0
## 26 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 21 0 0
## 26 0 0
## [1] "min distance(0.9222) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 432 11008 N iPad1#1
## 1014 11860 Y iPad1#1
## descr.my
## 432 Fully functional, no issues with it. Just have newer models now and want to sell. This model has the
## 1014 This unit has minor scratches on case and several small scratches on the display. It is in
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of
## 432 0 0.0000000 0.2991486 0 0.2683726 0.3138041 0
## 1014 0 0.1704558 0.2430582 0 0.2180528 0.2549658 0
## D.T.some D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been
## 432 0 0.352458 0 0 0 0 0
## 1014 0 0.000000 0 0 0 0 0
## D.T.broken D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover
## 432 0 0 0 0 0 0 0
## 1014 0 0 0 0 0 0 0
## D.T.part D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black
## 432 0 0 0 0 0 0 0
## 1014 0 0 0 0 0 0 0
## D.P.white D.P.gold D.P.spacegray
## 432 0 0 0
## 1014 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPad2#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 32 10039 N iPad2#0 0 0
## 61 10079 N iPad2#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 32 0 0 0 0 0 0 0 0
## 61 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 32 0 0 0 0 0 0 0
## 61 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 32 0 0 0 0 0 0 0
## 61 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 32 0 0 0 0 0 0 0
## 61 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 32 10039 N iPad2#0 0 0
## 32.1 10039 N iPad2#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 32 0 0 0 0 0 0 0 0
## 32.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 32 0 0 0 0 0 0 0
## 32.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 32 0 0 0 0 0 0 0
## 32.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 32 0 0 0 0 0 0 0
## 32.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPad2#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 1 10001 N iPad2#1
## 15 10021 Y iPad2#1
## descr.my D.T.condit D.T.is
## 1 iPad is in 8point5+ out of 10 cosmetic condition! 0.2636832 0.3409116
## 15 Crack on screen. 0.0000000 0.0000000
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 1 0 0 0 0 0.4976312 0 0 0
## 15 0 0 0 0 0.0000000 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 1 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 1 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 1 0 0 0 0 0 0 0
## 15 0 0 0 0 0 0 0
## [1] "min distance(0.9057) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 1419 12634 <NA> iPad2#1
## 1433 12654 <NA> iPad2#1
## descr.my
## 1419 Good condition IPAD2 32gb wifi + 3g verizon. LOT OF FIVE.
## 1433 Crack on digitizer near top. Top line of digitizer doesnt respond to touch. Other than that, all
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 1419 0.2636832 0 0 0 0 0 0.4976312 0
## 1433 0.0000000 0 0 0 0 0 0.3062346 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 1419 0 0 0 0 0 0 0
## 1433 0 0 0 0 0 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 1419 0 0 0 0 0 0 0
## 1433 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 1419 0 0 0 0 0 0 0
## 1433 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 1419 0 0
## 1433 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPad3#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 34 10041 Y iPad3#0 0 0
## 75 10100 N iPad3#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 34 0 0 0 0 0 0 0 0
## 75 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 34 0 0 0 0 0 0 0
## 75 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 34 0 0 0 0 0 0 0
## 75 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 34 0 0 0 0 0 0 0
## 75 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 34 10041 Y iPad3#0 0 0
## 34.1 10041 Y iPad3#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 34 0 0 0 0 0 0 0 0
## 34.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 34 0 0 0 0 0 0 0
## 34.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 34 0 0 0 0 0 0 0
## 34.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 34 0 0 0 0 0 0 0
## 34.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPad3#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 11 10017 Y iPad3#1
## 16 10022 N iPad3#1
## descr.my
## 11 Great working iPad. Very minor surface scratches on back as pictured. Other very light scratching
## 16 Great working iPad. Only light scratching consistent with typical wear as pictured.
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 11 0 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 11 0 0 0 0.000000 0 0 0
## 16 0 0 0 0.444445 0 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 11 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 11 0 0 0 0 0 0 0
## 16 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 11 0 0
## 16 0 0
## [1] "min distance(0.9097) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 998 11842 N iPad3#1
## 1005 11851 N iPad3#1
## descr.my
## 998 Item specifics: Condition: The screen got broken and it was replaced, the unit works great. Seller
## 1005 Item Specifics: Condition: USED The screen got broken and it was replaced and works great.
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 998 0.1506761 0 0 0 0 0.2913895 0 0
## 1005 0.1622666 0 0 0 0 0.3138041 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 998 0 0 0 0 0 0 0.484886
## 1005 0 0 0 0 0 0 0.522185
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 998 0 0 0 0 0 0 0
## 1005 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 998 0 0 0 0 0 0 0
## 1005 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 998 0 0
## 1005 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPad4#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 2 10003 Y iPad4#0 0 0
## 44 10053 N iPad4#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 2 0 0 0 0 0 0 0 0
## 44 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 2 0 0 0 0 0 0 0
## 44 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 2 0 0 0 0 0 0 0
## 44 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 2 0 0 0 0 0 0 0
## 44 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 2 10003 Y iPad4#0 0 0
## 2.1 10003 Y iPad4#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 2 0 0 0 0 0 0 0 0
## 2.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 2 0 0 0 0 0 0 0
## 2.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 2 0 0 0 0 0 0 0
## 2.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 2 0 0 0 0 0 0 0
## 2.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPad4#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 6 10011 Y iPad4#1
## 51 10063 N iPad4#1
## descr.my
## 6 good condition, minor wear and tear on body some light scratches on screen. functions great.
## 51 *FREE* Same-Day Ship | 90-Day Warranty | 100% Functional, Includes All Major Accessories, Shows
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 6 0.1506761 0 0 0 0 0 0 0.3343921
## 51 0.0000000 0 0 0 0 0 0 0.0000000
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 6 0 0 0 0.3174607 0 0 0
## 51 0 0 0 0.0000000 0 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 6 0 0 0 0 0 0 0
## 51 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 6 0.4260916 0 0 0 0 0 0
## 51 0.0000000 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 6 0 0
## 51 0 0
## [1] "min distance(0.9084) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 23 10029 N iPad4#1
## 410 10977 N iPad4#1
## descr.my
## 23 Customer or Carrier Return. Cosmetic condition is 9point5 of 10. NOT IN ORIGINAL BOX
## 410 Out of retail box. There's a small scratch and a smudge/mark down the length of screen. Neither of
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of
## 23 0.1917696 0.2479357 0 0.3678092 0 0 0.3619136
## 410 0.0000000 0.0000000 0 0.0000000 0 0 0.8530821
## D.T.some D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been
## 23 0 0 0 0 0 0 0
## 410 0 0 0 0 0 0 0
## D.T.broken D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover
## 23 0 0 0 0 0 0 0
## 410 0 0 0 0 0 0 0
## D.T.part D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black
## 23 0 0 0 0 0 0 0
## 410 0 0 0 0 0 0 0
## D.P.white D.P.gold D.P.spacegray
## 23 0 0 0
## 410 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPadAir#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 10 10016 N iPadAir#0 0 0
## 19 10025 N iPadAir#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 10 0 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 10 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 10 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 10 0 0 0 0 0 0 0
## 19 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 10 10016 N iPadAir#0 0 0
## 10.1 10016 N iPadAir#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 10 0 0 0 0 0 0 0 0
## 10.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 10 0 0 0 0 0 0 0
## 10.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 10 0 0 0 0 0 0 0
## 10.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 10 0 0 0 0 0 0 0
## 10.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPadAir#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 24 10030 Y iPadAir#1
## 46 10055 N iPadAir#1
## descr.my
## 24 Comes with USB Cable and wall adapter. May have minor dings or scuffs.
## 46 LikeNew
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 24 0 0 0 0.3371584 0 0 0 0
## 46 0 0 0 0.0000000 0 0 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 24 0.3818295 0 0 0 0 0 0
## 46 0.0000000 0 0 0 0 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 24 0 0 0 0 0 0 0
## 46 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 24 0 0 0 0 0 0 0
## 46 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 24 0 0
## 46 0 0
## [1] "min distance(0.9143) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 266 10647 N iPadAir#1
## 1180 12281 <NA> iPadAir#1
## descr.my
## 266 Has been covered with a ZaagInvisibleShield since it came out of the box and kept in a case. Front
## 1180 Has been covered with a ZaagInvisibleShield since it came out of the box and kept in a case. Front
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of
## 266 0 0 0.2287607 0 0 0.2399678 0.2341794
## 1180 0 0 0.2287607 0 0 0.2399678 0.2341794
## D.T.some D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been
## 266 0 0 0 0 0 0 0.2902503
## 1180 0 0 0 0 0 0 0.2902503
## D.T.broken D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover
## 266 0 0 0 0 0 0 0.3930288
## 1180 0 0 0 0 0 0 0.3930288
## D.T.part D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black
## 266 0 0 0 0 0 0 0
## 1180 0 0 0 0 0 0 0
## D.P.white D.P.gold D.P.spacegray
## 266 0 0 0
## 1180 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPadAir2#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 13 10019 Y iPadAir2#0 0 0
## 28 10035 N iPadAir2#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 13 0 0 0 0 0 0 0 0
## 28 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 13 0 0 0 0 0 0 0
## 28 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 13 0 0 0 0 0 0 0
## 28 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 13 0 0 0 0 0 0 0
## 28 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 13 10019 Y iPadAir2#0 0 0
## 13.1 10019 Y iPadAir2#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 13 0 0 0 0 0 0 0 0
## 13.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 13 0 0 0 0 0 0 0
## 13.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 13 0 0 0 0 0 0 0
## 13.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 13 0 0 0 0 0 0 0
## 13.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPadAir2#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 29 10036 N iPadAir2#1
## 132 10219 N iPadAir2#1
## descr.my
## 29 Used only 1 month, likenew condition, include smart case and all accessories
## 132 An item that has been restored to working order by the eBay seller or a third party not approved by
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 29 0.1757888 0 0.0000000 0.0000000 0 0 0 0
## 132 0.0000000 0 0.2430582 0.2528688 0 0 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 29 0 0 0 0 0 0.000000 0
## 132 0 0 0 0 0 0.308391 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 29 0 0 0 0 0 0 0
## 132 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 29 0 0.6240704 0 0 0 0 0
## 132 0 0.0000000 0 0 0 0 0
## D.P.gold D.P.spacegray
## 29 0 0
## 132 0 0
## [1] "min distance(0.9183) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 148 10266 N iPadAir2#1
## 235 10563 N iPadAir2#1
## descr.my
## 148 Used. Tested Cameras, speaker, charge port, buttons and touch. Has light scratches on case. Light
## 235 Used. Tested Cameras, speaker, charge port, headphone jack, buttons and touch. Has light scratches
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 148 0 0 0.3535392 0 0 0 0 0
## 235 0 0 0.4861164 0 0 0 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 148 0 0 0 0 0 0 0
## 235 0 0 0 0 0 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 148 0 0 0 0 0 0 0
## 235 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 148 0 0 0 0 0 0 0
## 235 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 148 0 0
## 235 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPadmini#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 47 10057 N iPadmini#0 0 0
## 52 10065 N iPadmini#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 47 0 0 0 0 0 0 0 0
## 52 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 47 0 0 0 0 0 0 0
## 52 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 47 0 0 0 0 0 0 0
## 52 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 47 0 0 0 0 0 0 0
## 52 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 47 10057 N iPadmini#0 0 0
## 47.1 10057 N iPadmini#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 47 0 0 0 0 0 0 0 0
## 47.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 47 0 0 0 0 0 0 0
## 47.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 47 0 0 0 0 0 0 0
## 47.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 47 0 0 0 0 0 0 0
## 47.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPadmini#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 50 10060 N iPadmini#1
## 59 10076 Y iPadmini#1
## descr.my
## 50 Minor scuffs in the back. Otherwise looks flawless. See all pictures.
## 59 Works perfectly, NOT iCloud locked, 1 owner. It is in not in very good conditions, but works
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 50 0.0000000 0.000000 0 0 0 0.0000000 0 0
## 59 0.1240862 0.160429 0 0 0 0.2399678 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 50 0 0 0 0 0.0000000 0 0
## 59 0 0 0 0 0.2769109 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 50 0 0 0 0 0 0 0
## 59 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 50 0 0 0 0 0 0 0
## 59 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 50 0 0
## 59 0 0
## [1] "min distance(0.9138) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 352 10872 Y iPadmini#1
## 488 11089 N iPadmini#1
## descr.my
## 352 This item is in Excellent cosmetic condition. It wont have any scratches on the screen. It may
## 488 It works perfect. Just minor scuffs.
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 352 0.1240862 0.160429 0 0 0.2052261 0.4799357 0 0
## 488 0.0000000 0.000000 0 0 0.0000000 0.6799089 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 352 0.2695267 0 0 0 0 0 0
## 488 0.0000000 0 0 0 0 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 352 0 0 0 0 0 0 0
## 488 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 352 0 0 0 0 0 0 0
## 488 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 352 0 0
## 488 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPadmini2#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 3 10004 N iPadmini2#0 0 0
## 101 10148 N iPadmini2#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 3 0 0 0 0 0 0 0 0
## 101 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 3 0 0 0 0 0 0 0
## 101 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 3 0 0 0 0 0 0 0
## 101 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 3 0 0 0 0 0 0 0
## 101 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 3 10004 N iPadmini2#0 0 0
## 3.1 10004 N iPadmini2#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 3 0 0 0 0 0 0 0 0
## 3.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 3 0 0 0 0 0 0 0
## 3.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 3 0 0 0 0 0 0 0
## 3.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 3 0 0 0 0 0 0 0
## 3.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPadmini2#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 12 10018 N iPadmini2#1
## 20 10026 N iPadmini2#1
## descr.my
## 12 We are selling good quality iPads that have been fully tested by an Apple Certified Technician. The
## 20 iPadmini2 is an open box item, good condition. Doesnt include original box. Include Genuine
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 12 0.0000000 0.0000000 0 0 0 0 0 0
## 20 0.1622666 0.2097918 0 0 0 0 0 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 12 0.3272824 0 0 0 0 0.3524468 0
## 20 0.0000000 0 0 0 0 0.0000000 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 12 0 0 0 0 0 0 0
## 20 0 0 0 0 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 12 0 0 0 0 0 0 0
## 20 0 0 0 1 0 0 0
## D.P.gold D.P.spacegray
## 12 0 0
## 20 0 0
## [1] "min distance(0.9166) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 76 10101 Y iPadmini2#1
## 1183 12287 <NA> iPadmini2#1
## descr.my
## 76 This item is in Excellent cosmetic condition. It wont have any scratches on the screen. It may
## 1183 it is LOCKED otherwise good condition.
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of
## 76 0.1240862 0.1604290 0 0 0.2052261 0.4799357 0
## 1183 0.4218932 0.5454586 0 0 0.0000000 0.8158907 0
## D.T.some D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been
## 76 0 0.2695267 0 0 0 0 0
## 1183 0 0.0000000 0 0 0 0 0
## D.T.broken D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover
## 76 0 0 0 0 0 0 0
## 1183 0 0 0 0 0 0 0
## D.T.part D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black
## 76 0 0 0 0 0 0 0
## 1183 0 0 0 0 0 0 0
## D.P.white D.P.gold D.P.spacegray
## 76 0 0 0
## 1183 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "Category: iPadmini3#0"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 5 10008 N iPadmini3#0 0 0
## 136 10232 N iPadmini3#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 5 0 0 0 0 0 0 0 0
## 136 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 5 0 0 0 0 0 0 0
## 136 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 5 0 0 0 0 0 0 0
## 136 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 5 0 0 0 0 0 0 0
## 136 0 0 0 0 0 0 0
## [1] "min distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr descr.my D.T.condit D.T.is
## 5 10008 N iPadmini3#0 0 0
## 5.1 10008 N iPadmini3#0 0 0
## D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic
## 5 0 0 0 0 0 0 0 0
## 5.1 0 0 0 0 0 0 0 0
## D.T.for. D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from
## 5 0 0 0 0 0 0 0
## 5.1 0 0 0 0 0 0 0
## D.T.damag D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart
## 5 0 0 0 0 0 0 0
## 5.1 0 0 0 0 0 0 0
## D.P.http D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 5 0 0 0 0 0 0 0
## 5.1 0 0 0 0 0 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "No module detected"
## [1] "Category: iPadmini3#1"
## [1] "max distance(1.0000) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 104 10155 N iPadmini3#1
## 276 10668 N iPadmini3#1
## descr.my D.T.condit D.T.is D.T.has
## 104 This listing doesnt include a charger. 0 0 0
## 276 New Open Box, see pics and description below 0 0 0
## D.T.or D.T.this D.T.it D.T.of D.T.some D.T.have D.T.devic D.T.for.
## 104 0 0.5814741 0 0 0 0 0 0
## 276 0 0.0000000 0 0 0 0 0 0
## D.T.wear D.T.but D.T.been D.T.broken D.T.will D.T.from D.T.damag
## 104 0 0 0 0 0 0 0
## 276 0 0 0 0 0 0 0
## D.T.sign D.T.list D.T.cover D.T.part D.T.tear D.T.smart D.P.http
## 104 0 0.8672404 0 0 0 0 0
## 276 0 0.0000000 0 0 0 0 0
## D.P.mini D.P.air D.P.black D.P.white D.P.gold D.P.spacegray
## 104 0 0 0 0 0 0
## 276 0 0 0 0 0 0
## [1] "min distance(0.9147) pair:"
## UniqueID sold.fctr prdl.descr.my.fctr
## 381 10924 N iPadmini3#1
## 1290 12430 <NA> iPadmini3#1
## descr.my
## 381 Overall excellent condition, very minor signs of use.
## 1290 Excellent condition. Under Applecare November 09, 2015. Working and tested. Minor sign of use.
## D.T.condit D.T.is D.T.has D.T.or D.T.this D.T.it D.T.of D.T.some
## 381 0.2636832 0 0 0 0 0 0.4976312 0
## 1290 0.2343851 0 0 0 0 0 0.4423389 0
## D.T.have D.T.devic D.T.for. D.T.wear D.T.but D.T.been D.T.broken
## 381 0 0 0 0 0 0 0
## 1290 0 0 0 0 0 0 0
## D.T.will D.T.from D.T.damag D.T.sign D.T.list D.T.cover D.T.part
## 381 0 0 0 0.6246239 0 0 0
## 1290 0 0 0 0.5552213 0 0 0
## D.T.tear D.T.smart D.P.http D.P.mini D.P.air D.P.black D.P.white
## 381 0 0 0 0 0 0 0
## 1290 0 0 0 0 0 0 0
## D.P.gold D.P.spacegray
## 381 0 0
## 1290 0 0
## Warning in max(vapply(evaled, length, integer(1))): no non-missing
## arguments to max; returning -Inf
## Source: local data frame [43 x 5]
##
## prdl.descr.my.fctr.clusterid N Y .entropy .knt
## (chr) (dbl) (dbl) (dbl) (dbl)
## 1 Unknown#0_1 52 15 0.5317758 67
## 2 Unknown#1_1 15 4 0.5146532 19
## 3 Unknown#1_2 18 0 0.0000000 18
## 4 Unknown#1_3 4 6 0.6730117 10
## 5 Unknown#1_4 5 1 0.4505612 6
## 6 iPad1#0_1 41 14 0.5672740 55
## 7 iPad1#1_1 19 4 0.4620369 23
## 8 iPad1#1_2 12 2 0.4101163 14
## 9 iPad1#1_3 6 9 0.6730117 15
## 10 iPad2#0_1 41 13 0.5519323 54
## .. ... ... ... ... ...
## [1] "glb_allobs_df$prdl.descr.my.fctr$.clusterid Entropy: 0.4871 (95.3162 pct)"
# Last call for data modifications
#stop(here") # sav_allobs_df <- glb_allobs_df
# glb_allobs_df[(glb_allobs_df$PropR == 0.75) & (glb_allobs_df$State == "Hawaii"), "PropR.fctr"] <- "N"
# Re-partition
# Rebuild the training and new (test) frames from the combined frame,
# selecting rows by their `.src` tag.
glb_trnobs_df <- glb_allobs_df[which(glb_allobs_df[[".src"]] == "Train"), , drop = FALSE]
glb_newobs_df <- glb_allobs_df[which(glb_allobs_df[[".src"]] == "Test"), , drop = FALSE]

# Register the start of the next major step in the chunk log.
glb_chunks_df <- myadd_chunk(glb_chunks_df,
                             "partition.data.training",
                             major.inc = TRUE)
## label step_major step_minor label_minor bgn
## 7 cluster.data 3 2 2 154.626
## 8 partition.data.training 4 0 0 193.695
## end elapsed
## 7 193.695 39.069
## 8 NA NA
# 4.0: partition data training

# Split the training observations into a fit set (glb_fitobs_df) and an
# out-of-bag validation set (glb_OOBobs_df).
#
# * If the response is NA for every new (test) observation, an OOB set is
#   sampled out of the training observations:
#     - without a category variable: caTools::sample.split on the raw response;
#     - with a category variable: stratified sampling on (response, category),
#       sized so the OOB category mix follows the new-data category mix.
# * Otherwise the new data carries a response and is used as OOB directly.
if (all(is.na(glb_newobs_df[, glb_rsp_var]))) {
    set.seed(glb_split_sample.seed)
    # Target OOB size: 10% more rows than the new-data set (may be fractional).
    OOB_size <- nrow(glb_newobs_df) * 1.1
    if (is.null(glb_category_var)) {
        require(caTools)
        split <- sample.split(glb_trnobs_df[, glb_rsp_var_raw],
                              SplitRatio=OOB_size / nrow(glb_trnobs_df))
        # Rows flagged TRUE become OOB; the remainder is used for fitting.
        glb_OOBobs_df <- glb_trnobs_df[split ,]
        glb_fitobs_df <- glb_trnobs_df[!split, ]
    } else {
        sample_vars <- c(glb_rsp_var_raw, glb_category_var)
        # Desired OOB count per response value, proportional to the response
        # frequencies in the training set (`.n` is read as the count column).
        rspvar_freq_df <- orderBy(reformulate(glb_rsp_var_raw),
                                  mycreate_sqlxtab_df(glb_trnobs_df, glb_rsp_var_raw))
        OOB_rspvar_size <- 1.0 * OOB_size * rspvar_freq_df$.n / sum(rspvar_freq_df$.n)
        # Category frequencies in new data and in training data, joined so that
        # categories missing on either side get a count of 0.
        newobs_freq_df <- orderBy(reformulate(glb_category_var),
                                  mycreate_sqlxtab_df(glb_newobs_df, glb_category_var))
        trnobs_freq_df <- orderBy(reformulate(glb_category_var),
                                  mycreate_sqlxtab_df(glb_trnobs_df, glb_category_var))
        allobs_freq_df <- merge(newobs_freq_df, trnobs_freq_df, by=glb_category_var,
                                all=TRUE, sort=TRUE, suffixes=c(".Tst", ".Train"))
        allobs_freq_df[is.na(allobs_freq_df)] <- 0
        # Outer product (category share in new data) x (OOB size per response),
        # rounded up. as.vector() flattens column-major, i.e. all categories for
        # the first response value, then all categories for the next, and so on.
        OOB_strata_size <- ceiling(
            as.vector(matrix(allobs_freq_df$.n.Tst * 1.0 / sum(allobs_freq_df$.n.Tst)) %*%
                      matrix(OOB_rspvar_size, nrow = 1)))
        # Request at least one observation from every stratum.
        OOB_strata_size[OOB_strata_size == 0] <- 1
        # Full grid of (response, category) combinations, ordered like the sizes.
        OOB_strata_df <- expand.grid(glb_rsp_var_raw=rspvar_freq_df[, glb_rsp_var_raw],
                                     glb_category_var=allobs_freq_df[, glb_category_var])
        names(OOB_strata_df) <- sample_vars
        OOB_strata_df <- orderBy(reformulate(sample_vars), OOB_strata_df)
        # Training counts per stratum; after the all=TRUE merge, combinations
        # that never occur in training have `.n` == NA.
        # NOTE(review): the positional pairing below relies on merge() returning
        # rows in the same (response, category) order as OOB_strata_size - confirm.
        trnobs_univ_df <- orderBy(reformulate(sample_vars),
                                  mycreate_sqlxtab_df(glb_trnobs_df, sample_vars))
        trnobs_univ_df <- merge(trnobs_univ_df, OOB_strata_df, all=TRUE)
        tmp_trnobs_df <- orderBy(reformulate(c(glb_rsp_var_raw, glb_category_var)),
                                 glb_trnobs_df)
        # Adjust OOB_strata_size (desired # of OOB obs) if > # of trn obs
        ix <- which(!is.na(trnobs_univ_df$.n) & (OOB_strata_size > trnobs_univ_df$.n))
        OOB_strata_size[ix] <- trnobs_univ_df[ix, ".n"]
        require(sampling)
        # Sample without replacement within each stratum; sizes are passed only
        # for strata that actually exist in the training data.
        split_strata <- sampling::strata(tmp_trnobs_df,
                                         stratanames = c(glb_rsp_var_raw, glb_category_var),
                                         size = OOB_strata_size[!is.na(trnobs_univ_df$.n)],
                                         method = "srswor")
        # Keep only the original training columns in the sampled OOB rows.
        glb_OOBobs_df <- getdata(tmp_trnobs_df, split_strata)[, names(glb_trnobs_df)]
        # Everything in training that was not sampled into OOB is used to fit.
        glb_fitobs_df <- glb_trnobs_df[!glb_trnobs_df[, glb_id_var] %in%
                                       glb_OOBobs_df[, glb_id_var], ]
    }
} else {
    print(sprintf("Newdata contains non-NA data for %s; setting OOB to Newdata",
                  glb_rsp_var))
    glb_fitobs_df <- glb_trnobs_df; glb_OOBobs_df <- glb_newobs_df
}
## Loading required package: sampling
##
## Attaching package: 'sampling'
##
## The following objects are masked from 'package:survival':
##
## cluster, strata
##
## The following object is masked from 'package:caret':
##
## cluster
# Optionally cap the number of fit observations at glb_max_fitobs by sampling
# on the raw response. Emits a warning whenever the cap is applied.
if (!is.null(glb_max_fitobs) && (nrow(glb_fitobs_df) > glb_max_fitobs)) {
    warning("glb_fitobs_df restricted to glb_max_fitobs: ",
            format(glb_max_fitobs, big.mark=","))
    org_fitobs_df <- glb_fitobs_df
    # Call sample.split through its namespace: the partition step only attaches
    # caTools when no category variable is set, so the bare name may not be on
    # the search path here.
    # NOTE(review): SplitRatio is passed a row count rather than a fraction -
    # confirm against the caTools documentation that counts are accepted.
    split <- caTools::sample.split(org_fitobs_df[, glb_rsp_var_raw],
                                   SplitRatio=glb_max_fitobs)
    glb_fitobs_df <- org_fitobs_df[split, ]
    # Release the full-size copy.
    org_fitobs_df <- NULL
}
# Observations whose id is listed in glb_obsfit_outliers are appended to the
# OOB set and then removed from the fit set.
if (!is.null(glb_obsfit_outliers)) {
    glb_OOBobs_df <- rbind(
        glb_OOBobs_df,
        glb_fitobs_df[is.element(glb_fitobs_df[, glb_id_var], glb_obsfit_outliers), ])
    glb_fitobs_df <-
        glb_fitobs_df[!is.element(glb_fitobs_df[, glb_id_var], glb_obsfit_outliers), ]
}
# Tag each observation with the partition it landed in: "" by default, "Fit"
# for ids present in the fit set, "OOB" for ids present in the OOB set.
# "OOB" is assigned after "Fit", so it wins if an id appears in both.

# Combined frame
glb_allobs_df$.lcn <- ""
glb_allobs_df$.lcn[is.element(glb_allobs_df[, glb_id_var],
                              glb_fitobs_df[, glb_id_var])] <- "Fit"
glb_allobs_df$.lcn[is.element(glb_allobs_df[, glb_id_var],
                              glb_OOBobs_df[, glb_id_var])] <- "OOB"

# Training frame
glb_trnobs_df$.lcn <- ""
glb_trnobs_df$.lcn[is.element(glb_trnobs_df[, glb_id_var],
                              glb_fitobs_df[, glb_id_var])] <- "Fit"
glb_trnobs_df$.lcn[is.element(glb_trnobs_df[, glb_id_var],
                              glb_OOBobs_df[, glb_id_var])] <- "OOB"
# Print the cross-tabulation of `location_var` by `partition_var`, first as
# counts and then as row-wise proportions.
#
# Args:
#   obs_df:        observations to tabulate.
#   location_var:  name of the column whose values become the row names
#                  (called with ".lcn" in this file).
#   partition_var: name of the column tabulated across the table's columns.
#
# Returns:
#   Invisibly (via print), the row-proportion table.
dsp_class_dstrb <- function(obs_df, location_var, partition_var) {
    xtab_df <- mycreate_xtab_df(obs_df, c(location_var, partition_var))
    rownames(xtab_df) <- xtab_df[, location_var]
    # Drop the label column now that it lives in the row names.
    # `-grepl(...)` turned the logical match into 0/-1 indices, which always
    # removed column 1 regardless of where the match was (and removed every
    # column when nothing matched). Use a logical mask instead; fixed = TRUE
    # keeps the "." in names such as ".lcn" from acting as a regex wildcard,
    # and drop = FALSE keeps a data frame even if one column remains.
    xtab_df <- xtab_df[, !grepl(location_var, names(xtab_df), fixed = TRUE), drop = FALSE]
    print(xtab_df)
    print(xtab_df / rowSums(xtab_df, na.rm=TRUE))
}
# Sanity-check the partition: print the response distribution per location and
# compare the category mix of the OOB set against that of the Test set.
if (!is.null(glb_category_var)) {
    if (glb_is_classification) {
        dsp_class_dstrb(glb_allobs_df, ".lcn", glb_rsp_var_raw)
    }

    # Per-category counts for Test observations and for OOB observations.
    newobs_ctgry_df <- mycreate_sqlxtab_df(subset(glb_allobs_df, .src == "Test"),
                                           glb_category_var)
    OOBobs_ctgry_df <- mycreate_sqlxtab_df(subset(glb_allobs_df, .lcn == "OOB"),
                                           glb_category_var)

    # Full outer join so categories present on only one side are kept.
    glb_ctgry_df <- merge(newobs_ctgry_df, OOBobs_ctgry_df,
                          by = glb_category_var,
                          all = TRUE,
                          suffixes = c(".Tst", ".OOB"))

    # Share of each category within its own set.
    glb_ctgry_df[[".freqRatio.Tst"]] <-
        glb_ctgry_df[[".n.Tst"]] / sum(glb_ctgry_df[[".n.Tst"]], na.rm = TRUE)
    glb_ctgry_df[[".freqRatio.OOB"]] <-
        glb_ctgry_df[[".n.OOB"]] / sum(glb_ctgry_df[[".n.OOB"]], na.rm = TRUE)

    print(orderBy(~-.freqRatio.Tst-.freqRatio.OOB, glb_ctgry_df))
}
## sold.0 sold.1 sold.NA
## NA NA 422
## Fit 424 107 NA
## OOB 374 110 NA
## sold.0 sold.1 sold.NA
## NA NA 1
## Fit 0.7984934 0.2015066 NA
## OOB 0.7727273 0.2272727 NA
## prdl.descr.my.fctr .n.Tst .n.OOB .freqRatio.Tst .freqRatio.OOB
## 6 iPad2#1 53 59 0.12559242 0.12190083
## 1 Unknown#0 31 35 0.07345972 0.07231405
## 5 iPad2#0 31 35 0.07345972 0.07231405
## 13 iPadAir2#0 28 32 0.06635071 0.06611570
## 16 iPadmini#1 26 31 0.06161137 0.06404959
## 15 iPadmini#0 26 30 0.06161137 0.06198347
## 10 iPad4#1 24 27 0.05687204 0.05578512
## 2 Unknown#1 23 26 0.05450237 0.05371901
## 4 iPad1#1 22 26 0.05213270 0.05371901
## 8 iPad3#1 20 23 0.04739336 0.04752066
## 11 iPadAir#0 19 22 0.04502370 0.04545455
## 12 iPadAir#1 19 22 0.04502370 0.04545455
## 9 iPad4#0 18 21 0.04265403 0.04338843
## 19 iPadmini3#0 18 21 0.04265403 0.04338843
## 3 iPad1#0 16 18 0.03791469 0.03719008
## 17 iPadmini2#0 14 17 0.03317536 0.03512397
## 7 iPad3#0 12 14 0.02843602 0.02892562
## 14 iPadAir2#1 8 9 0.01895735 0.01859504
## 18 iPadmini2#1 8 9 0.01895735 0.01859504
## 20 iPadmini3#1 6 7 0.01421801 0.01446281
# Report the dimensions (rows, columns) of every partition.
# In the recorded output glb_allobs_df and glb_trnobs_df show one more column
# than the fit / OOB / new frames: the `.lcn` location tag is assigned only to
# those two frames in the labelling step of this chunk.
print("glb_allobs_df: "); print(dim(glb_allobs_df))
## [1] "glb_allobs_df: "
## [1] 1437 114
print("glb_trnobs_df: "); print(dim(glb_trnobs_df))
## [1] "glb_trnobs_df: "
## [1] 1015 114
print("glb_fitobs_df: "); print(dim(glb_fitobs_df))
## [1] "glb_fitobs_df: "
## [1] 531 113
print("glb_OOBobs_df: "); print(dim(glb_OOBobs_df))
## [1] "glb_OOBobs_df: "
## [1] 484 113
print("glb_newobs_df: "); print(dim(glb_newobs_df))
## [1] "glb_newobs_df: "
## [1] 422 113
# # Does not handle NULL or length(glb_id_var) > 1
# Optionally persist the combined observations to disk. Only glb_allobs_df is
# saved; the other partitions are left commented out in the argument list.
if (glb_save_envir)
    save(glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
         file=paste0(glb_out_pfx, "blddfs_dsk.RData"))
# load(paste0(glb_out_pfx, "blddfs_dsk.RData"))

# `split` is only created on the code paths that call sample.split, so remove
# it only when it exists; an unconditional rm(split) warns otherwise.
# inherits = FALSE keeps base::split() from satisfying the check.
if (exists("split", inherits = FALSE)) rm(split)
## Warning in rm(split): object 'split' not found
# Start the "select.features" step in the chunk log as a new major step; the
# printed log shows each chunk's label, step numbers and bgn/end/elapsed times.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "select.features", major.inc=TRUE)
## label step_major step_minor label_minor bgn
## 8 partition.data.training 4 0 0 193.695
## 9 select.features 5 0 0 194.566
## end elapsed
## 8 194.566 0.871
## 9 NA NA
# 5.0: select features
#stop(here"); glb_to_sav(); glb_allobs_df <- sav_allobs_df
# Build the feature table from the training observations and display it.
# The printed table lists, per feature, its correlation with the response
# (cor.y, cor.y.abs) and whether it is excluded as a feature (exclude.as.feat).
glb_feats_df <- myselect_features(
    entity_df = glb_trnobs_df,
    exclude_vars_as_features = glbFeatsExclude,
    rsp_var = glb_rsp_var)
print(glb_feats_df)
## Warning in cor(data.matrix(entity_df[, sel_feats]), y =
## as.numeric(entity_df[, : the standard deviation is zero
## id cor.y
## sold sold 1.000000000
## spdiff.cut.fctr spdiff.cut.fctr 0.184272206
## sprice.root2 sprice.root2 -0.158256112
## sprice.predict.diff sprice.predict.diff 0.156628104
## startprice startprice -0.153924173
## sprice.log10 sprice.log10 -0.150032840
## D.chrs.pnct15.n.log D.chrs.pnct15.n.log 0.128054077
## D.T.of D.T.of 0.096131208
## D.chrs.pnct06.n.log D.chrs.pnct06.n.log -0.092298979
## D.T.broken D.T.broken 0.091379799
## D.T.it D.T.it 0.090364752
## D.chrs.pnct05.n.log D.chrs.pnct05.n.log -0.084210486
## D.T.cover D.T.cover 0.084037929
## D.chrs.pnct16.n.log D.chrs.pnct16.n.log 0.082266220
## D.T.has D.T.has 0.081745956
## D.T.sign D.T.sign 0.079164120
## startprice.log10.predict startprice.log10.predict -0.078370786
## D.T.list D.T.list -0.072729364
## condition.fctr condition.fctr -0.071404008
## D.T.for. D.T.for. 0.070528147
## D.T.part D.T.part 0.062899486
## D.terms.n.stem.stop.Ratio D.terms.n.stem.stop.Ratio 0.061427272
## startprice.dcm1.is9 startprice.dcm1.is9 -0.053556098
## sprice.d20nexp sprice.d20nexp 0.050688215
## D.chrs.pnct12.n.log D.chrs.pnct12.n.log -0.047796912
## D.chrs.pnct09.n.log D.chrs.pnct09.n.log -0.047112729
## D.T.some D.T.some 0.044306219
## D.T.tear D.T.tear 0.043092211
## D.chrs.pnct08.n.log D.chrs.pnct08.n.log -0.043039010
## D.dgts.n.log D.dgts.n.log -0.042077589
## D.chrs.pnct13.n.log D.chrs.pnct13.n.log -0.041480651
## D.T.wear D.T.wear 0.040531333
## D.chrs.pnct28.n.log D.chrs.pnct28.n.log -0.039954067
## color.fctr color.fctr -0.038964633
## .clusterid .clusterid 0.038727962
## .clusterid.fctr .clusterid.fctr 0.038727962
## D.terms.post.stop.n D.terms.post.stop.n -0.038467757
## D.chrs.pnct14.n.log D.chrs.pnct14.n.log -0.037263745
## storage.fctr storage.fctr 0.037263476
## D.terms.post.stem.n D.terms.post.stem.n -0.037108281
## startprice.dcm2.is9 startprice.dcm2.is9 -0.036879506
## D.chrs.pnct11.n.log D.chrs.pnct11.n.log -0.034402855
## D.terms.post.stop.n.log D.terms.post.stop.n.log -0.033752177
## prdl.descr.my.fctr prdl.descr.my.fctr -0.033343866
## D.terms.post.stem.n.log D.terms.post.stem.n.log -0.033160500
## D.wrds.unq.n.log D.wrds.unq.n.log -0.033160500
## startprice.dgt3.is9 startprice.dgt3.is9 -0.032800679
## D.chrs.n.log D.chrs.n.log -0.032754921
## D.T.will D.T.will -0.032522829
## D.ratio.wrds.stop.n.wrds.n D.ratio.wrds.stop.n.wrds.n 0.032325360
## D.chrs.uppr.n.log D.chrs.uppr.n.log -0.032193462
## D.wrds.n.log D.wrds.n.log -0.029259294
## D.T.from D.T.from -0.025989943
## D.chrs.pnct01.n.log D.chrs.pnct01.n.log -0.024885276
## carrier.fctr carrier.fctr -0.021728112
## startprice.dgt1.is9 startprice.dgt1.is9 -0.021465682
## D.P.air D.P.air -0.021410325
## D.T.smart D.T.smart 0.021146840
## D.T.devic D.T.devic 0.020563983
## D.weight.post.stem.sum D.weight.post.stem.sum -0.019167392
## D.weight.sum D.weight.sum -0.019167392
## D.ratio.weight.sum.wrds.n D.ratio.weight.sum.wrds.n 0.018621089
## D.weight.post.stop.sum D.weight.post.stop.sum -0.017806251
## D.T.have D.T.have 0.017801186
## D.chrs.pnct03.n.log D.chrs.pnct03.n.log -0.016376061
## D.chrs.pnct18.n.log D.chrs.pnct18.n.log -0.016376061
## D.P.gold D.P.gold -0.016376061
## D.P.spacegray D.P.spacegray -0.016376061
## D.chrs.pnct07.n.log D.chrs.pnct07.n.log -0.016366426
## UniqueID UniqueID -0.015687428
## D.P.mini D.P.mini 0.013828986
## .rnorm .rnorm 0.013801593
## D.T.condit D.T.condit -0.013082326
## D.T.but D.T.but 0.011210903
## D.chrs.pnct10.n.log D.chrs.pnct10.n.log 0.009962551
## D.T.this D.T.this -0.008850439
## startprice.dgt2.is9 startprice.dgt2.is9 0.008359669
## D.T.is D.T.is 0.007630386
## cellular.fctr cellular.fctr -0.006974300
## D.T.been D.T.been -0.005901474
## D.P.black D.P.black 0.005554954
## D.P.white D.P.white 0.005554954
## D.T.damag D.T.damag -0.005225674
## D.weight.sum.stem.stop.Ratio D.weight.sum.stem.stop.Ratio -0.003205858
## D.T.or D.T.or -0.002368073
## D.wrds.stop.n.log D.wrds.stop.n.log -0.001512415
## biddable biddable NA
## D.chrs.pnct02.n.log D.chrs.pnct02.n.log NA
## D.chrs.pnct04.n.log D.chrs.pnct04.n.log NA
## D.chrs.pnct17.n.log D.chrs.pnct17.n.log NA
## D.chrs.pnct19.n.log D.chrs.pnct19.n.log NA
## D.chrs.pnct20.n.log D.chrs.pnct20.n.log NA
## D.chrs.pnct21.n.log D.chrs.pnct21.n.log NA
## D.chrs.pnct22.n.log D.chrs.pnct22.n.log NA
## D.chrs.pnct23.n.log D.chrs.pnct23.n.log NA
## D.chrs.pnct24.n.log D.chrs.pnct24.n.log NA
## D.chrs.pnct25.n.log D.chrs.pnct25.n.log NA
## D.chrs.pnct26.n.log D.chrs.pnct26.n.log NA
## D.chrs.pnct27.n.log D.chrs.pnct27.n.log NA
## D.chrs.pnct29.n.log D.chrs.pnct29.n.log NA
## D.chrs.pnct30.n.log D.chrs.pnct30.n.log NA
## D.P.http D.P.http NA
## exclude.as.feat cor.y.abs
## sold 1 1.000000000
## spdiff.cut.fctr 0 0.184272206
## sprice.root2 0 0.158256112
## sprice.predict.diff 1 0.156628104
## startprice 1 0.153924173
## sprice.log10 0 0.150032840
## D.chrs.pnct15.n.log 0 0.128054077
## D.T.of 0 0.096131208
## D.chrs.pnct06.n.log 0 0.092298979
## D.T.broken 0 0.091379799
## D.T.it 0 0.090364752
## D.chrs.pnct05.n.log 0 0.084210486
## D.T.cover 0 0.084037929
## D.chrs.pnct16.n.log 0 0.082266220
## D.T.has 0 0.081745956
## D.T.sign 0 0.079164120
## startprice.log10.predict 1 0.078370786
## D.T.list 0 0.072729364
## condition.fctr 0 0.071404008
## D.T.for. 0 0.070528147
## D.T.part 0 0.062899486
## D.terms.n.stem.stop.Ratio 0 0.061427272
## startprice.dcm1.is9 0 0.053556098
## sprice.d20nexp 0 0.050688215
## D.chrs.pnct12.n.log 0 0.047796912
## D.chrs.pnct09.n.log 0 0.047112729
## D.T.some 0 0.044306219
## D.T.tear 0 0.043092211
## D.chrs.pnct08.n.log 0 0.043039010
## D.dgts.n.log 0 0.042077589
## D.chrs.pnct13.n.log 0 0.041480651
## D.T.wear 0 0.040531333
## D.chrs.pnct28.n.log 0 0.039954067
## color.fctr 0 0.038964633
## .clusterid 1 0.038727962
## .clusterid.fctr 0 0.038727962
## D.terms.post.stop.n 1 0.038467757
## D.chrs.pnct14.n.log 0 0.037263745
## storage.fctr 0 0.037263476
## D.terms.post.stem.n 1 0.037108281
## startprice.dcm2.is9 0 0.036879506
## D.chrs.pnct11.n.log 0 0.034402855
## D.terms.post.stop.n.log 0 0.033752177
## prdl.descr.my.fctr 0 0.033343866
## D.terms.post.stem.n.log 0 0.033160500
## D.wrds.unq.n.log 0 0.033160500
## startprice.dgt3.is9 0 0.032800679
## D.chrs.n.log 0 0.032754921
## D.T.will 0 0.032522829
## D.ratio.wrds.stop.n.wrds.n 0 0.032325360
## D.chrs.uppr.n.log 0 0.032193462
## D.wrds.n.log 0 0.029259294
## D.T.from 0 0.025989943
## D.chrs.pnct01.n.log 0 0.024885276
## carrier.fctr 0 0.021728112
## startprice.dgt1.is9 0 0.021465682
## D.P.air 0 0.021410325
## D.T.smart 0 0.021146840
## D.T.devic 0 0.020563983
## D.weight.post.stem.sum 0 0.019167392
## D.weight.sum 0 0.019167392
## D.ratio.weight.sum.wrds.n 0 0.018621089
## D.weight.post.stop.sum 0 0.017806251
## D.T.have 0 0.017801186
## D.chrs.pnct03.n.log 0 0.016376061
## D.chrs.pnct18.n.log 0 0.016376061
## D.P.gold 0 0.016376061
## D.P.spacegray 0 0.016376061
## D.chrs.pnct07.n.log 0 0.016366426
## UniqueID 1 0.015687428
## D.P.mini 0 0.013828986
## .rnorm 0 0.013801593
## D.T.condit 0 0.013082326
## D.T.but 0 0.011210903
## D.chrs.pnct10.n.log 0 0.009962551
## D.T.this 0 0.008850439
## startprice.dgt2.is9 0 0.008359669
## D.T.is 0 0.007630386
## cellular.fctr 0 0.006974300
## D.T.been 0 0.005901474
## D.P.black 0 0.005554954
## D.P.white 0 0.005554954
## D.T.damag 0 0.005225674
## D.weight.sum.stem.stop.Ratio 0 0.003205858
## D.T.or 0 0.002368073
## D.wrds.stop.n.log 0 0.001512415
## biddable 0 NA
## D.chrs.pnct02.n.log 0 NA
## D.chrs.pnct04.n.log 0 NA
## D.chrs.pnct17.n.log 0 NA
## D.chrs.pnct19.n.log 0 NA
## D.chrs.pnct20.n.log 0 NA
## D.chrs.pnct21.n.log 0 NA
## D.chrs.pnct22.n.log 0 NA
## D.chrs.pnct23.n.log 0 NA
## D.chrs.pnct24.n.log 0 NA
## D.chrs.pnct25.n.log 0 NA
## D.chrs.pnct26.n.log 0 NA
## D.chrs.pnct27.n.log 0 NA
## D.chrs.pnct29.n.log 0 NA
## D.chrs.pnct30.n.log 0 NA
## D.P.http 0 NA
# Run the correlated-feature check over the feature table (it reports pairs it
# identifies as highly correlated), then re-sort the table by descending cor.y
# and display it.
glb_feats_df <- orderBy(
    ~-cor.y,
    myfind_cor_features(
        feats_df = glb_feats_df,
        obs_df = glb_trnobs_df,
        rsp_var = glb_rsp_var,
        nzv.freqCut = glb_nzv_freqCut,
        nzv.uniqueCut = glb_nzv_uniqueCut))
print(glb_feats_df)
## [1] "cor(D.terms.post.stem.n.log, D.wrds.unq.n.log)=1.0000"
## [1] "cor(sold.fctr, D.terms.post.stem.n.log)=-0.0332"
## [1] "cor(sold.fctr, D.wrds.unq.n.log)=-0.0332"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.wrds.unq.n.log as highly correlated with
## D.terms.post.stem.n.log
## [1] "cor(D.weight.post.stem.sum, D.weight.sum)=1.0000"
## [1] "cor(sold.fctr, D.weight.post.stem.sum)=-0.0192"
## [1] "cor(sold.fctr, D.weight.sum)=-0.0192"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.weight.sum as highly correlated with
## D.weight.post.stem.sum
## [1] "cor(D.terms.post.stem.n.log, D.terms.post.stop.n.log)=0.9999"
## [1] "cor(sold.fctr, D.terms.post.stem.n.log)=-0.0332"
## [1] "cor(sold.fctr, D.terms.post.stop.n.log)=-0.0338"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.terms.post.stem.n.log as highly correlated
## with D.terms.post.stop.n.log
## [1] "cor(D.chrs.n.log, D.chrs.uppr.n.log)=0.9993"
## [1] "cor(sold.fctr, D.chrs.n.log)=-0.0328"
## [1] "cor(sold.fctr, D.chrs.uppr.n.log)=-0.0322"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.chrs.uppr.n.log as highly correlated with
## D.chrs.n.log
## [1] "cor(D.weight.post.stem.sum, D.weight.post.stop.sum)=0.9980"
## [1] "cor(sold.fctr, D.weight.post.stem.sum)=-0.0192"
## [1] "cor(sold.fctr, D.weight.post.stop.sum)=-0.0178"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.weight.post.stop.sum as highly correlated
## with D.weight.post.stem.sum
## [1] "cor(D.terms.post.stop.n.log, D.wrds.n.log)=0.9960"
## [1] "cor(sold.fctr, D.terms.post.stop.n.log)=-0.0338"
## [1] "cor(sold.fctr, D.wrds.n.log)=-0.0293"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.wrds.n.log as highly correlated with
## D.terms.post.stop.n.log
## [1] "cor(D.chrs.n.log, D.terms.post.stop.n.log)=0.9895"
## [1] "cor(sold.fctr, D.chrs.n.log)=-0.0328"
## [1] "cor(sold.fctr, D.terms.post.stop.n.log)=-0.0338"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.chrs.n.log as highly correlated with
## D.terms.post.stop.n.log
## [1] "cor(D.ratio.wrds.stop.n.wrds.n, D.terms.post.stop.n.log)=-0.9756"
## [1] "cor(sold.fctr, D.ratio.wrds.stop.n.wrds.n)=0.0323"
## [1] "cor(sold.fctr, D.terms.post.stop.n.log)=-0.0338"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.ratio.wrds.stop.n.wrds.n as highly correlated
## with D.terms.post.stop.n.log
## [1] "cor(sprice.log10, sprice.root2)=0.9521"
## [1] "cor(sold.fctr, sprice.log10)=-0.1500"
## [1] "cor(sold.fctr, sprice.root2)=-0.1583"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified sprice.log10 as highly correlated with
## sprice.root2
## [1] "cor(D.terms.post.stop.n.log, D.weight.post.stem.sum)=0.9196"
## [1] "cor(sold.fctr, D.terms.post.stop.n.log)=-0.0338"
## [1] "cor(sold.fctr, D.weight.post.stem.sum)=-0.0192"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.weight.post.stem.sum as highly correlated
## with D.terms.post.stop.n.log
## [1] "cor(startprice.dcm1.is9, startprice.dcm2.is9)=0.7421"
## [1] "cor(sold.fctr, startprice.dcm1.is9)=-0.0536"
## [1] "cor(sold.fctr, startprice.dcm2.is9)=-0.0369"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified startprice.dcm2.is9 as highly correlated with
## startprice.dcm1.is9
## [1] "cor(D.chrs.pnct05.n.log, D.dgts.n.log)=0.7419"
## [1] "cor(sold.fctr, D.chrs.pnct05.n.log)=-0.0842"
## [1] "cor(sold.fctr, D.dgts.n.log)=-0.0421"
## Warning in myfind_cor_features(feats_df = glb_feats_df, obs_df =
## glb_trnobs_df, : Identified D.dgts.n.log as highly correlated with
## D.chrs.pnct05.n.log
## id cor.y
## sold sold 1.000000000
## spdiff.cut.fctr spdiff.cut.fctr 0.184272206
## sprice.predict.diff sprice.predict.diff 0.156628104
## D.chrs.pnct15.n.log D.chrs.pnct15.n.log 0.128054077
## D.T.of D.T.of 0.096131208
## D.T.broken D.T.broken 0.091379799
## D.T.it D.T.it 0.090364752
## D.T.cover D.T.cover 0.084037929
## D.chrs.pnct16.n.log D.chrs.pnct16.n.log 0.082266220
## D.T.has D.T.has 0.081745956
## D.T.sign D.T.sign 0.079164120
## D.T.for. D.T.for. 0.070528147
## D.T.part D.T.part 0.062899486
## D.terms.n.stem.stop.Ratio D.terms.n.stem.stop.Ratio 0.061427272
## sprice.d20nexp sprice.d20nexp 0.050688215
## D.T.some D.T.some 0.044306219
## D.T.tear D.T.tear 0.043092211
## D.T.wear D.T.wear 0.040531333
## .clusterid .clusterid 0.038727962
## .clusterid.fctr .clusterid.fctr 0.038727962
## storage.fctr storage.fctr 0.037263476
## D.ratio.wrds.stop.n.wrds.n D.ratio.wrds.stop.n.wrds.n 0.032325360
## D.T.smart D.T.smart 0.021146840
## D.T.devic D.T.devic 0.020563983
## D.ratio.weight.sum.wrds.n D.ratio.weight.sum.wrds.n 0.018621089
## D.T.have D.T.have 0.017801186
## D.P.mini D.P.mini 0.013828986
## .rnorm .rnorm 0.013801593
## D.T.but D.T.but 0.011210903
## D.chrs.pnct10.n.log D.chrs.pnct10.n.log 0.009962551
## startprice.dgt2.is9 startprice.dgt2.is9 0.008359669
## D.T.is D.T.is 0.007630386
## D.P.black D.P.black 0.005554954
## D.P.white D.P.white 0.005554954
## D.wrds.stop.n.log D.wrds.stop.n.log -0.001512415
## D.T.or D.T.or -0.002368073
## D.weight.sum.stem.stop.Ratio D.weight.sum.stem.stop.Ratio -0.003205858
## D.T.damag D.T.damag -0.005225674
## D.T.been D.T.been -0.005901474
## cellular.fctr cellular.fctr -0.006974300
## D.T.this D.T.this -0.008850439
## D.T.condit D.T.condit -0.013082326
## UniqueID UniqueID -0.015687428
## D.chrs.pnct07.n.log D.chrs.pnct07.n.log -0.016366426
## D.P.gold D.P.gold -0.016376061
## D.P.spacegray D.P.spacegray -0.016376061
## D.chrs.pnct03.n.log D.chrs.pnct03.n.log -0.016376061
## D.chrs.pnct18.n.log D.chrs.pnct18.n.log -0.016376061
## D.weight.post.stop.sum D.weight.post.stop.sum -0.017806251
## D.weight.post.stem.sum D.weight.post.stem.sum -0.019167392
## D.weight.sum D.weight.sum -0.019167392
## D.P.air D.P.air -0.021410325
## startprice.dgt1.is9 startprice.dgt1.is9 -0.021465682
## carrier.fctr carrier.fctr -0.021728112
## D.chrs.pnct01.n.log D.chrs.pnct01.n.log -0.024885276
## D.T.from D.T.from -0.025989943
## D.wrds.n.log D.wrds.n.log -0.029259294
## D.chrs.uppr.n.log D.chrs.uppr.n.log -0.032193462
## D.T.will D.T.will -0.032522829
## D.chrs.n.log D.chrs.n.log -0.032754921
## startprice.dgt3.is9 startprice.dgt3.is9 -0.032800679
## D.terms.post.stem.n.log D.terms.post.stem.n.log -0.033160500
## D.wrds.unq.n.log D.wrds.unq.n.log -0.033160500
## prdl.descr.my.fctr prdl.descr.my.fctr -0.033343866
## D.terms.post.stop.n.log D.terms.post.stop.n.log -0.033752177
## D.chrs.pnct11.n.log D.chrs.pnct11.n.log -0.034402855
## startprice.dcm2.is9 startprice.dcm2.is9 -0.036879506
## D.terms.post.stem.n D.terms.post.stem.n -0.037108281
## D.chrs.pnct14.n.log D.chrs.pnct14.n.log -0.037263745
## D.terms.post.stop.n D.terms.post.stop.n -0.038467757
## color.fctr color.fctr -0.038964633
## D.chrs.pnct28.n.log D.chrs.pnct28.n.log -0.039954067
## D.chrs.pnct13.n.log D.chrs.pnct13.n.log -0.041480651
## D.dgts.n.log D.dgts.n.log -0.042077589
## D.chrs.pnct08.n.log D.chrs.pnct08.n.log -0.043039010
## D.chrs.pnct09.n.log D.chrs.pnct09.n.log -0.047112729
## D.chrs.pnct12.n.log D.chrs.pnct12.n.log -0.047796912
## startprice.dcm1.is9 startprice.dcm1.is9 -0.053556098
## condition.fctr condition.fctr -0.071404008
## D.T.list D.T.list -0.072729364
## startprice.log10.predict startprice.log10.predict -0.078370786
## D.chrs.pnct05.n.log D.chrs.pnct05.n.log -0.084210486
## D.chrs.pnct06.n.log D.chrs.pnct06.n.log -0.092298979
## sprice.log10 sprice.log10 -0.150032840
## startprice startprice -0.153924173
## sprice.root2 sprice.root2 -0.158256112
## D.P.http D.P.http NA
## D.chrs.pnct02.n.log D.chrs.pnct02.n.log NA
## D.chrs.pnct04.n.log D.chrs.pnct04.n.log NA
## D.chrs.pnct17.n.log D.chrs.pnct17.n.log NA
## D.chrs.pnct19.n.log D.chrs.pnct19.n.log NA
## D.chrs.pnct20.n.log D.chrs.pnct20.n.log NA
## D.chrs.pnct21.n.log D.chrs.pnct21.n.log NA
## D.chrs.pnct22.n.log D.chrs.pnct22.n.log NA
## D.chrs.pnct23.n.log D.chrs.pnct23.n.log NA
## D.chrs.pnct24.n.log D.chrs.pnct24.n.log NA
## D.chrs.pnct25.n.log D.chrs.pnct25.n.log NA
## D.chrs.pnct26.n.log D.chrs.pnct26.n.log NA
## D.chrs.pnct27.n.log D.chrs.pnct27.n.log NA
## D.chrs.pnct29.n.log D.chrs.pnct29.n.log NA
## D.chrs.pnct30.n.log D.chrs.pnct30.n.log NA
## biddable biddable NA
## exclude.as.feat cor.y.abs
## sold 1 1.000000000
## spdiff.cut.fctr 0 0.184272206
## sprice.predict.diff 1 0.156628104
## D.chrs.pnct15.n.log 0 0.128054077
## D.T.of 0 0.096131208
## D.T.broken 0 0.091379799
## D.T.it 0 0.090364752
## D.T.cover 0 0.084037929
## D.chrs.pnct16.n.log 0 0.082266220
## D.T.has 0 0.081745956
## D.T.sign 0 0.079164120
## D.T.for. 0 0.070528147
## D.T.part 0 0.062899486
## D.terms.n.stem.stop.Ratio 0 0.061427272
## sprice.d20nexp 0 0.050688215
## D.T.some 0 0.044306219
## D.T.tear 0 0.043092211
## D.T.wear 0 0.040531333
## .clusterid 1 0.038727962
## .clusterid.fctr 0 0.038727962
## storage.fctr 0 0.037263476
## D.ratio.wrds.stop.n.wrds.n 0 0.032325360
## D.T.smart 0 0.021146840
## D.T.devic 0 0.020563983
## D.ratio.weight.sum.wrds.n 0 0.018621089
## D.T.have 0 0.017801186
## D.P.mini 0 0.013828986
## .rnorm 0 0.013801593
## D.T.but 0 0.011210903
## D.chrs.pnct10.n.log 0 0.009962551
## startprice.dgt2.is9 0 0.008359669
## D.T.is 0 0.007630386
## D.P.black 0 0.005554954
## D.P.white 0 0.005554954
## D.wrds.stop.n.log 0 0.001512415
## D.T.or 0 0.002368073
## D.weight.sum.stem.stop.Ratio 0 0.003205858
## D.T.damag 0 0.005225674
## D.T.been 0 0.005901474
## cellular.fctr 0 0.006974300
## D.T.this 0 0.008850439
## D.T.condit 0 0.013082326
## UniqueID 1 0.015687428
## D.chrs.pnct07.n.log 0 0.016366426
## D.P.gold 0 0.016376061
## D.P.spacegray 0 0.016376061
## D.chrs.pnct03.n.log 0 0.016376061
## D.chrs.pnct18.n.log 0 0.016376061
## D.weight.post.stop.sum 0 0.017806251
## D.weight.post.stem.sum 0 0.019167392
## D.weight.sum 0 0.019167392
## D.P.air 0 0.021410325
## startprice.dgt1.is9 0 0.021465682
## carrier.fctr 0 0.021728112
## D.chrs.pnct01.n.log 0 0.024885276
## D.T.from 0 0.025989943
## D.wrds.n.log 0 0.029259294
## D.chrs.uppr.n.log 0 0.032193462
## D.T.will 0 0.032522829
## D.chrs.n.log 0 0.032754921
## startprice.dgt3.is9 0 0.032800679
## D.terms.post.stem.n.log 0 0.033160500
## D.wrds.unq.n.log 0 0.033160500
## prdl.descr.my.fctr 0 0.033343866
## D.terms.post.stop.n.log 0 0.033752177
## D.chrs.pnct11.n.log 0 0.034402855
## startprice.dcm2.is9 0 0.036879506
## D.terms.post.stem.n 1 0.037108281
## D.chrs.pnct14.n.log 0 0.037263745
## D.terms.post.stop.n 1 0.038467757
## color.fctr 0 0.038964633
## D.chrs.pnct28.n.log 0 0.039954067
## D.chrs.pnct13.n.log 0 0.041480651
## D.dgts.n.log 0 0.042077589
## D.chrs.pnct08.n.log 0 0.043039010
## D.chrs.pnct09.n.log 0 0.047112729
## D.chrs.pnct12.n.log 0 0.047796912
## startprice.dcm1.is9 0 0.053556098
## condition.fctr 0 0.071404008
## D.T.list 0 0.072729364
## startprice.log10.predict 1 0.078370786
## D.chrs.pnct05.n.log 0 0.084210486
## D.chrs.pnct06.n.log 0 0.092298979
## sprice.log10 0 0.150032840
## startprice 1 0.153924173
## sprice.root2 0 0.158256112
## D.P.http 0 NA
## D.chrs.pnct02.n.log 0 NA
## D.chrs.pnct04.n.log 0 NA
## D.chrs.pnct17.n.log 0 NA
## D.chrs.pnct19.n.log 0 NA
## D.chrs.pnct20.n.log 0 NA
## D.chrs.pnct21.n.log 0 NA
## D.chrs.pnct22.n.log 0 NA
## D.chrs.pnct23.n.log 0 NA
## D.chrs.pnct24.n.log 0 NA
## D.chrs.pnct25.n.log 0 NA
## D.chrs.pnct26.n.log 0 NA
## D.chrs.pnct27.n.log 0 NA
## D.chrs.pnct29.n.log 0 NA
## D.chrs.pnct30.n.log 0 NA
## biddable 0 NA
## cor.high.X freqRatio
## sold <NA> 3.677419
## spdiff.cut.fctr <NA> 1.930000
## sprice.predict.diff <NA> 1.000000
## D.chrs.pnct15.n.log <NA> 143.142857
## D.T.of <NA> 95.100000
## D.T.broken <NA> 251.000000
## D.T.it <NA> 137.000000
## D.T.cover <NA> 502.000000
## D.chrs.pnct16.n.log <NA> 252.750000
## D.T.has <NA> 95.100000
## D.T.sign <NA> 140.285714
## D.T.for. <NA> 198.000000
## D.T.part <NA> 251.250000
## D.terms.n.stem.stop.Ratio <NA> 54.222222
## sprice.d20nexp <NA> 1.312500
## D.T.some <NA> 195.200000
## D.T.tear <NA> 250.250000
## D.T.wear <NA> 138.714286
## .clusterid <NA> 5.388889
## .clusterid.fctr <NA> 5.388889
## storage.fctr <NA> 2.719101
## D.ratio.wrds.stop.n.wrds.n D.terms.post.stop.n.log 16.147059
## D.T.smart <NA> 336.000000
## D.T.devic <NA> 48.250000
## D.ratio.weight.sum.wrds.n <NA> 32.294118
## D.T.have <NA> 139.714286
## D.P.mini <NA> 111.666667
## .rnorm <NA> 1.000000
## D.T.but <NA> 121.875000
## D.chrs.pnct10.n.log <NA> 168.000000
## startprice.dgt2.is9 <NA> 6.048611
## D.T.is <NA> 43.750000
## D.P.black <NA> 252.750000
## D.P.white <NA> 252.750000
## D.wrds.stop.n.log <NA> 8.222222
## D.T.or <NA> 50.157895
## D.weight.sum.stem.stop.Ratio <NA> 33.411765
## D.T.damag <NA> 251.250000
## D.T.been <NA> 140.285714
## cellular.fctr <NA> 1.779180
## D.T.this <NA> 54.882353
## D.T.condit <NA> 22.228571
## UniqueID <NA> 1.000000
## D.chrs.pnct07.n.log <NA> 168.000000
## D.P.gold <NA> 1014.000000
## D.P.spacegray <NA> 1014.000000
## D.chrs.pnct03.n.log <NA> 1014.000000
## D.chrs.pnct18.n.log <NA> 1014.000000
## D.weight.post.stop.sum D.weight.post.stem.sum 32.294118
## D.weight.post.stem.sum D.terms.post.stop.n.log 32.294118
## D.weight.sum D.weight.post.stem.sum 32.294118
## D.P.air <NA> 143.857143
## startprice.dgt1.is9 <NA> 1.512376
## carrier.fctr <NA> 2.792079
## D.chrs.pnct01.n.log <NA> 61.562500
## D.T.from <NA> 51.736842
## D.wrds.n.log D.terms.post.stop.n.log 8.546875
## D.chrs.uppr.n.log D.chrs.n.log 12.477273
## D.T.will <NA> 249.750000
## D.chrs.n.log D.terms.post.stop.n.log 11.395833
## startprice.dgt3.is9 <NA> 252.750000
## D.terms.post.stem.n.log D.terms.post.stop.n.log 8.318182
## D.wrds.unq.n.log D.terms.post.stem.n.log 8.318182
## prdl.descr.my.fctr <NA> 1.010989
## D.terms.post.stop.n.log <NA> 10.557692
## D.chrs.pnct11.n.log <NA> 9.797753
## startprice.dcm2.is9 startprice.dcm1.is9 2.284790
## D.terms.post.stem.n <NA> 8.318182
## D.chrs.pnct14.n.log <NA> 25.578947
## D.terms.post.stop.n <NA> 10.557692
## color.fctr <NA> 1.564516
## D.chrs.pnct28.n.log <NA> 252.250000
## D.chrs.pnct13.n.log <NA> 4.664516
## D.dgts.n.log D.chrs.pnct05.n.log 22.414634
## D.chrs.pnct08.n.log <NA> 42.478261
## D.chrs.pnct09.n.log <NA> 167.666667
## D.chrs.pnct12.n.log <NA> 29.242424
## startprice.dcm1.is9 <NA> 1.556675
## condition.fctr <NA> 2.823529
## D.T.list <NA> 51.947368
## startprice.log10.predict <NA> 1.000000
## D.chrs.pnct05.n.log <NA> 23.166667
## D.chrs.pnct06.n.log <NA> 36.407407
## sprice.log10 sprice.root2 1.312500
## startprice <NA> 1.312500
## sprice.root2 <NA> 1.312500
## D.P.http <NA> 0.000000
## D.chrs.pnct02.n.log <NA> 0.000000
## D.chrs.pnct04.n.log <NA> 0.000000
## D.chrs.pnct17.n.log <NA> 0.000000
## D.chrs.pnct19.n.log <NA> 0.000000
## D.chrs.pnct20.n.log <NA> 0.000000
## D.chrs.pnct21.n.log <NA> 0.000000
## D.chrs.pnct22.n.log <NA> 0.000000
## D.chrs.pnct23.n.log <NA> 0.000000
## D.chrs.pnct24.n.log <NA> 0.000000
## D.chrs.pnct25.n.log <NA> 0.000000
## D.chrs.pnct26.n.log <NA> 0.000000
## D.chrs.pnct27.n.log <NA> 0.000000
## D.chrs.pnct29.n.log <NA> 0.000000
## D.chrs.pnct30.n.log <NA> 0.000000
## biddable <NA> 0.000000
## percentUnique zeroVar nzv is.cor.y.abs.low
## sold 0.19704433 FALSE FALSE FALSE
## spdiff.cut.fctr 0.78817734 FALSE FALSE FALSE
## sprice.predict.diff 100.00000000 FALSE FALSE FALSE
## D.chrs.pnct15.n.log 0.29556650 FALSE TRUE FALSE
## D.T.of 1.77339901 FALSE TRUE FALSE
## D.T.broken 0.78817734 FALSE TRUE FALSE
## D.T.it 1.97044335 FALSE TRUE FALSE
## D.T.cover 0.98522167 FALSE TRUE FALSE
## D.chrs.pnct16.n.log 0.19704433 FALSE TRUE FALSE
## D.T.has 1.67487685 FALSE TRUE FALSE
## D.T.sign 1.28078818 FALSE TRUE FALSE
## D.T.for. 1.37931034 FALSE TRUE FALSE
## D.T.part 0.68965517 FALSE TRUE FALSE
## D.terms.n.stem.stop.Ratio 0.98522167 FALSE TRUE FALSE
## sprice.d20nexp 46.20689655 FALSE FALSE FALSE
## D.T.some 1.57635468 FALSE TRUE FALSE
## D.T.tear 0.78817734 FALSE TRUE FALSE
## D.T.wear 1.28078818 FALSE TRUE FALSE
## .clusterid 0.39408867 FALSE FALSE FALSE
## .clusterid.fctr 0.39408867 FALSE FALSE FALSE
## storage.fctr 0.49261084 FALSE FALSE FALSE
## D.ratio.wrds.stop.n.wrds.n 6.50246305 FALSE FALSE FALSE
## D.T.smart 0.59113300 FALSE TRUE FALSE
## D.T.devic 1.18226601 FALSE TRUE FALSE
## D.ratio.weight.sum.wrds.n 36.45320197 FALSE FALSE FALSE
## D.T.have 1.18226601 FALSE TRUE FALSE
## D.P.mini 0.29556650 FALSE TRUE FALSE
## .rnorm 100.00000000 FALSE FALSE FALSE
## D.T.but 1.28078818 FALSE TRUE TRUE
## D.chrs.pnct10.n.log 0.29556650 FALSE TRUE TRUE
## startprice.dgt2.is9 0.19704433 FALSE FALSE TRUE
## D.T.is 2.85714286 FALSE TRUE TRUE
## D.P.black 0.19704433 FALSE TRUE TRUE
## D.P.white 0.19704433 FALSE TRUE TRUE
## D.wrds.stop.n.log 0.88669951 FALSE FALSE TRUE
## D.T.or 1.57635468 FALSE TRUE TRUE
## D.weight.sum.stem.stop.Ratio 34.97536946 FALSE FALSE TRUE
## D.T.damag 0.68965517 FALSE TRUE TRUE
## D.T.been 0.98522167 FALSE TRUE TRUE
## cellular.fctr 0.29556650 FALSE FALSE TRUE
## D.T.this 1.57635468 FALSE TRUE TRUE
## D.T.condit 2.16748768 FALSE TRUE TRUE
## UniqueID 100.00000000 FALSE FALSE FALSE
## D.chrs.pnct07.n.log 0.29556650 FALSE TRUE FALSE
## D.P.gold 0.19704433 FALSE TRUE FALSE
## D.P.spacegray 0.19704433 FALSE TRUE FALSE
## D.chrs.pnct03.n.log 0.19704433 FALSE TRUE FALSE
## D.chrs.pnct18.n.log 0.19704433 FALSE TRUE FALSE
## D.weight.post.stop.sum 36.55172414 FALSE FALSE FALSE
## D.weight.post.stem.sum 36.35467980 FALSE FALSE FALSE
## D.weight.sum 36.35467980 FALSE FALSE FALSE
## D.P.air 0.29556650 FALSE TRUE FALSE
## startprice.dgt1.is9 0.19704433 FALSE FALSE FALSE
## carrier.fctr 0.59113300 FALSE FALSE FALSE
## D.chrs.pnct01.n.log 0.59113300 FALSE TRUE FALSE
## D.T.from 0.98522167 FALSE TRUE FALSE
## D.wrds.n.log 2.26600985 FALSE FALSE FALSE
## D.chrs.uppr.n.log 7.09359606 FALSE FALSE FALSE
## D.T.will 0.88669951 FALSE TRUE FALSE
## D.chrs.n.log 8.96551724 FALSE FALSE FALSE
## startprice.dgt3.is9 0.19704433 FALSE TRUE FALSE
## D.terms.post.stem.n.log 2.06896552 FALSE FALSE FALSE
## D.wrds.unq.n.log 2.06896552 FALSE FALSE FALSE
## prdl.descr.my.fctr 1.97044335 FALSE FALSE FALSE
## D.terms.post.stop.n.log 2.06896552 FALSE FALSE FALSE
## D.chrs.pnct11.n.log 0.68965517 FALSE FALSE FALSE
## startprice.dcm2.is9 0.19704433 FALSE FALSE FALSE
## D.terms.post.stem.n 2.06896552 FALSE FALSE FALSE
## D.chrs.pnct14.n.log 0.49261084 FALSE TRUE FALSE
## D.terms.post.stop.n 2.06896552 FALSE FALSE FALSE
## color.fctr 0.49261084 FALSE FALSE FALSE
## D.chrs.pnct28.n.log 0.29556650 FALSE TRUE FALSE
## D.chrs.pnct13.n.log 0.59113300 FALSE FALSE FALSE
## D.dgts.n.log 0.88669951 FALSE TRUE FALSE
## D.chrs.pnct08.n.log 0.39408867 FALSE TRUE FALSE
## D.chrs.pnct09.n.log 0.39408867 FALSE TRUE FALSE
## D.chrs.pnct12.n.log 0.39408867 FALSE TRUE FALSE
## startprice.dcm1.is9 0.19704433 FALSE FALSE FALSE
## condition.fctr 0.59113300 FALSE FALSE FALSE
## D.T.list 0.39408867 FALSE TRUE FALSE
## startprice.log10.predict 100.00000000 FALSE FALSE FALSE
## D.chrs.pnct05.n.log 0.19704433 FALSE TRUE FALSE
## D.chrs.pnct06.n.log 0.29556650 FALSE TRUE FALSE
## sprice.log10 46.20689655 FALSE FALSE FALSE
## startprice 46.20689655 FALSE FALSE FALSE
## sprice.root2 46.20689655 FALSE FALSE FALSE
## D.P.http 0.09852217 TRUE TRUE NA
## D.chrs.pnct02.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct04.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct17.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct19.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct20.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct21.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct22.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct23.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct24.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct25.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct26.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct27.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct29.n.log 0.09852217 TRUE TRUE NA
## D.chrs.pnct30.n.log 0.09852217 TRUE TRUE NA
## biddable 0.09852217 TRUE TRUE NA
# Scatter plot of every feature's percentUnique (x) against its freqRatio (y), coloured by the
# nzv flag, with reference lines at the glb_nzv_freqCut and glb_nzv_uniqueCut values.
plt_feats_df <- glb_feats_df
print(myplot_scatter(plt_feats_df, "percentUnique", "freqRatio",
colorcol_name="nzv", jitter=TRUE) +
#geom_point(aes(shape=nzv)) +
geom_point() +
# x axis limited to [-5, 25]; features outside that range are dropped from the plot
# (the source of the "Removed ... rows" warnings printed below)
xlim(-5, 25) +
geom_hline(yintercept=glb_nzv_freqCut) +
geom_vline(xintercept=glb_nzv_uniqueCut))
## Warning in myplot_scatter(plt_feats_df, "percentUnique", "freqRatio",
## colorcol_name = "nzv", : converting nzv to class:factor
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 13 rows containing missing values (geom_point).
## Warning: Removed 13 rows containing missing values (geom_point).
# List the rows of the feature table whose nzv flag is TRUE
print(subset(glb_feats_df, nzv))
## id cor.y
## D.chrs.pnct15.n.log D.chrs.pnct15.n.log 0.128054077
## D.T.of D.T.of 0.096131208
## D.T.broken D.T.broken 0.091379799
## D.T.it D.T.it 0.090364752
## D.T.cover D.T.cover 0.084037929
## D.chrs.pnct16.n.log D.chrs.pnct16.n.log 0.082266220
## D.T.has D.T.has 0.081745956
## D.T.sign D.T.sign 0.079164120
## D.T.for. D.T.for. 0.070528147
## D.T.part D.T.part 0.062899486
## D.terms.n.stem.stop.Ratio D.terms.n.stem.stop.Ratio 0.061427272
## D.T.some D.T.some 0.044306219
## D.T.tear D.T.tear 0.043092211
## D.T.wear D.T.wear 0.040531333
## D.T.smart D.T.smart 0.021146840
## D.T.devic D.T.devic 0.020563983
## D.T.have D.T.have 0.017801186
## D.P.mini D.P.mini 0.013828986
## D.T.but D.T.but 0.011210903
## D.chrs.pnct10.n.log D.chrs.pnct10.n.log 0.009962551
## D.T.is D.T.is 0.007630386
## D.P.black D.P.black 0.005554954
## D.P.white D.P.white 0.005554954
## D.T.or D.T.or -0.002368073
## D.T.damag D.T.damag -0.005225674
## D.T.been D.T.been -0.005901474
## D.T.this D.T.this -0.008850439
## D.T.condit D.T.condit -0.013082326
## D.chrs.pnct07.n.log D.chrs.pnct07.n.log -0.016366426
## D.P.gold D.P.gold -0.016376061
## D.P.spacegray D.P.spacegray -0.016376061
## D.chrs.pnct03.n.log D.chrs.pnct03.n.log -0.016376061
## D.chrs.pnct18.n.log D.chrs.pnct18.n.log -0.016376061
## D.P.air D.P.air -0.021410325
## D.chrs.pnct01.n.log D.chrs.pnct01.n.log -0.024885276
## D.T.from D.T.from -0.025989943
## D.T.will D.T.will -0.032522829
## startprice.dgt3.is9 startprice.dgt3.is9 -0.032800679
## D.chrs.pnct14.n.log D.chrs.pnct14.n.log -0.037263745
## D.chrs.pnct28.n.log D.chrs.pnct28.n.log -0.039954067
## D.dgts.n.log D.dgts.n.log -0.042077589
## D.chrs.pnct08.n.log D.chrs.pnct08.n.log -0.043039010
## D.chrs.pnct09.n.log D.chrs.pnct09.n.log -0.047112729
## D.chrs.pnct12.n.log D.chrs.pnct12.n.log -0.047796912
## D.T.list D.T.list -0.072729364
## D.chrs.pnct05.n.log D.chrs.pnct05.n.log -0.084210486
## D.chrs.pnct06.n.log D.chrs.pnct06.n.log -0.092298979
## D.P.http D.P.http NA
## D.chrs.pnct02.n.log D.chrs.pnct02.n.log NA
## D.chrs.pnct04.n.log D.chrs.pnct04.n.log NA
## D.chrs.pnct17.n.log D.chrs.pnct17.n.log NA
## D.chrs.pnct19.n.log D.chrs.pnct19.n.log NA
## D.chrs.pnct20.n.log D.chrs.pnct20.n.log NA
## D.chrs.pnct21.n.log D.chrs.pnct21.n.log NA
## D.chrs.pnct22.n.log D.chrs.pnct22.n.log NA
## D.chrs.pnct23.n.log D.chrs.pnct23.n.log NA
## D.chrs.pnct24.n.log D.chrs.pnct24.n.log NA
## D.chrs.pnct25.n.log D.chrs.pnct25.n.log NA
## D.chrs.pnct26.n.log D.chrs.pnct26.n.log NA
## D.chrs.pnct27.n.log D.chrs.pnct27.n.log NA
## D.chrs.pnct29.n.log D.chrs.pnct29.n.log NA
## D.chrs.pnct30.n.log D.chrs.pnct30.n.log NA
## biddable biddable NA
## exclude.as.feat cor.y.abs cor.high.X
## D.chrs.pnct15.n.log 0 0.128054077 <NA>
## D.T.of 0 0.096131208 <NA>
## D.T.broken 0 0.091379799 <NA>
## D.T.it 0 0.090364752 <NA>
## D.T.cover 0 0.084037929 <NA>
## D.chrs.pnct16.n.log 0 0.082266220 <NA>
## D.T.has 0 0.081745956 <NA>
## D.T.sign 0 0.079164120 <NA>
## D.T.for. 0 0.070528147 <NA>
## D.T.part 0 0.062899486 <NA>
## D.terms.n.stem.stop.Ratio 0 0.061427272 <NA>
## D.T.some 0 0.044306219 <NA>
## D.T.tear 0 0.043092211 <NA>
## D.T.wear 0 0.040531333 <NA>
## D.T.smart 0 0.021146840 <NA>
## D.T.devic 0 0.020563983 <NA>
## D.T.have 0 0.017801186 <NA>
## D.P.mini 0 0.013828986 <NA>
## D.T.but 0 0.011210903 <NA>
## D.chrs.pnct10.n.log 0 0.009962551 <NA>
## D.T.is 0 0.007630386 <NA>
## D.P.black 0 0.005554954 <NA>
## D.P.white 0 0.005554954 <NA>
## D.T.or 0 0.002368073 <NA>
## D.T.damag 0 0.005225674 <NA>
## D.T.been 0 0.005901474 <NA>
## D.T.this 0 0.008850439 <NA>
## D.T.condit 0 0.013082326 <NA>
## D.chrs.pnct07.n.log 0 0.016366426 <NA>
## D.P.gold 0 0.016376061 <NA>
## D.P.spacegray 0 0.016376061 <NA>
## D.chrs.pnct03.n.log 0 0.016376061 <NA>
## D.chrs.pnct18.n.log 0 0.016376061 <NA>
## D.P.air 0 0.021410325 <NA>
## D.chrs.pnct01.n.log 0 0.024885276 <NA>
## D.T.from 0 0.025989943 <NA>
## D.T.will 0 0.032522829 <NA>
## startprice.dgt3.is9 0 0.032800679 <NA>
## D.chrs.pnct14.n.log 0 0.037263745 <NA>
## D.chrs.pnct28.n.log 0 0.039954067 <NA>
## D.dgts.n.log 0 0.042077589 D.chrs.pnct05.n.log
## D.chrs.pnct08.n.log 0 0.043039010 <NA>
## D.chrs.pnct09.n.log 0 0.047112729 <NA>
## D.chrs.pnct12.n.log 0 0.047796912 <NA>
## D.T.list 0 0.072729364 <NA>
## D.chrs.pnct05.n.log 0 0.084210486 <NA>
## D.chrs.pnct06.n.log 0 0.092298979 <NA>
## D.P.http 0 NA <NA>
## D.chrs.pnct02.n.log 0 NA <NA>
## D.chrs.pnct04.n.log 0 NA <NA>
## D.chrs.pnct17.n.log 0 NA <NA>
## D.chrs.pnct19.n.log 0 NA <NA>
## D.chrs.pnct20.n.log 0 NA <NA>
## D.chrs.pnct21.n.log 0 NA <NA>
## D.chrs.pnct22.n.log 0 NA <NA>
## D.chrs.pnct23.n.log 0 NA <NA>
## D.chrs.pnct24.n.log 0 NA <NA>
## D.chrs.pnct25.n.log 0 NA <NA>
## D.chrs.pnct26.n.log 0 NA <NA>
## D.chrs.pnct27.n.log 0 NA <NA>
## D.chrs.pnct29.n.log 0 NA <NA>
## D.chrs.pnct30.n.log 0 NA <NA>
## biddable 0 NA <NA>
## freqRatio percentUnique zeroVar nzv
## D.chrs.pnct15.n.log 143.14286 0.29556650 FALSE TRUE
## D.T.of 95.10000 1.77339901 FALSE TRUE
## D.T.broken 251.00000 0.78817734 FALSE TRUE
## D.T.it 137.00000 1.97044335 FALSE TRUE
## D.T.cover 502.00000 0.98522167 FALSE TRUE
## D.chrs.pnct16.n.log 252.75000 0.19704433 FALSE TRUE
## D.T.has 95.10000 1.67487685 FALSE TRUE
## D.T.sign 140.28571 1.28078818 FALSE TRUE
## D.T.for. 198.00000 1.37931034 FALSE TRUE
## D.T.part 251.25000 0.68965517 FALSE TRUE
## D.terms.n.stem.stop.Ratio 54.22222 0.98522167 FALSE TRUE
## D.T.some 195.20000 1.57635468 FALSE TRUE
## D.T.tear 250.25000 0.78817734 FALSE TRUE
## D.T.wear 138.71429 1.28078818 FALSE TRUE
## D.T.smart 336.00000 0.59113300 FALSE TRUE
## D.T.devic 48.25000 1.18226601 FALSE TRUE
## D.T.have 139.71429 1.18226601 FALSE TRUE
## D.P.mini 111.66667 0.29556650 FALSE TRUE
## D.T.but 121.87500 1.28078818 FALSE TRUE
## D.chrs.pnct10.n.log 168.00000 0.29556650 FALSE TRUE
## D.T.is 43.75000 2.85714286 FALSE TRUE
## D.P.black 252.75000 0.19704433 FALSE TRUE
## D.P.white 252.75000 0.19704433 FALSE TRUE
## D.T.or 50.15789 1.57635468 FALSE TRUE
## D.T.damag 251.25000 0.68965517 FALSE TRUE
## D.T.been 140.28571 0.98522167 FALSE TRUE
## D.T.this 54.88235 1.57635468 FALSE TRUE
## D.T.condit 22.22857 2.16748768 FALSE TRUE
## D.chrs.pnct07.n.log 168.00000 0.29556650 FALSE TRUE
## D.P.gold 1014.00000 0.19704433 FALSE TRUE
## D.P.spacegray 1014.00000 0.19704433 FALSE TRUE
## D.chrs.pnct03.n.log 1014.00000 0.19704433 FALSE TRUE
## D.chrs.pnct18.n.log 1014.00000 0.19704433 FALSE TRUE
## D.P.air 143.85714 0.29556650 FALSE TRUE
## D.chrs.pnct01.n.log 61.56250 0.59113300 FALSE TRUE
## D.T.from 51.73684 0.98522167 FALSE TRUE
## D.T.will 249.75000 0.88669951 FALSE TRUE
## startprice.dgt3.is9 252.75000 0.19704433 FALSE TRUE
## D.chrs.pnct14.n.log 25.57895 0.49261084 FALSE TRUE
## D.chrs.pnct28.n.log 252.25000 0.29556650 FALSE TRUE
## D.dgts.n.log 22.41463 0.88669951 FALSE TRUE
## D.chrs.pnct08.n.log 42.47826 0.39408867 FALSE TRUE
## D.chrs.pnct09.n.log 167.66667 0.39408867 FALSE TRUE
## D.chrs.pnct12.n.log 29.24242 0.39408867 FALSE TRUE
## D.T.list 51.94737 0.39408867 FALSE TRUE
## D.chrs.pnct05.n.log 23.16667 0.19704433 FALSE TRUE
## D.chrs.pnct06.n.log 36.40741 0.29556650 FALSE TRUE
## D.P.http 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct02.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct04.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct17.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct19.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct20.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct21.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct22.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct23.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct24.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct25.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct26.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct27.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct29.n.log 0.00000 0.09852217 TRUE TRUE
## D.chrs.pnct30.n.log 0.00000 0.09852217 TRUE TRUE
## biddable 0.00000 0.09852217 TRUE TRUE
## is.cor.y.abs.low
## D.chrs.pnct15.n.log FALSE
## D.T.of FALSE
## D.T.broken FALSE
## D.T.it FALSE
## D.T.cover FALSE
## D.chrs.pnct16.n.log FALSE
## D.T.has FALSE
## D.T.sign FALSE
## D.T.for. FALSE
## D.T.part FALSE
## D.terms.n.stem.stop.Ratio FALSE
## D.T.some FALSE
## D.T.tear FALSE
## D.T.wear FALSE
## D.T.smart FALSE
## D.T.devic FALSE
## D.T.have FALSE
## D.P.mini FALSE
## D.T.but TRUE
## D.chrs.pnct10.n.log TRUE
## D.T.is TRUE
## D.P.black TRUE
## D.P.white TRUE
## D.T.or TRUE
## D.T.damag TRUE
## D.T.been TRUE
## D.T.this TRUE
## D.T.condit TRUE
## D.chrs.pnct07.n.log FALSE
## D.P.gold FALSE
## D.P.spacegray FALSE
## D.chrs.pnct03.n.log FALSE
## D.chrs.pnct18.n.log FALSE
## D.P.air FALSE
## D.chrs.pnct01.n.log FALSE
## D.T.from FALSE
## D.T.will FALSE
## startprice.dgt3.is9 FALSE
## D.chrs.pnct14.n.log FALSE
## D.chrs.pnct28.n.log FALSE
## D.dgts.n.log FALSE
## D.chrs.pnct08.n.log FALSE
## D.chrs.pnct09.n.log FALSE
## D.chrs.pnct12.n.log FALSE
## D.T.list FALSE
## D.chrs.pnct05.n.log FALSE
## D.chrs.pnct06.n.log FALSE
## D.P.http NA
## D.chrs.pnct02.n.log NA
## D.chrs.pnct04.n.log NA
## D.chrs.pnct17.n.log NA
## D.chrs.pnct19.n.log NA
## D.chrs.pnct20.n.log NA
## D.chrs.pnct21.n.log NA
## D.chrs.pnct22.n.log NA
## D.chrs.pnct23.n.log NA
## D.chrs.pnct24.n.log NA
## D.chrs.pnct25.n.log NA
## D.chrs.pnct26.n.log NA
## D.chrs.pnct27.n.log NA
## D.chrs.pnct29.n.log NA
## D.chrs.pnct30.n.log NA
## biddable NA
# Rebuild the Train / Test observation frames without the columns flagged nzv in glb_feats_df;
# the union() keeps glb_cluster_entropy_var even if it is itself flagged nzv.
tmp_allobs_df <-
glb_allobs_df[, union(setdiff(names(glb_allobs_df), subset(glb_feats_df, nzv)$id),
glb_cluster_entropy_var)]
# Split on the .src marker column
glb_trnobs_df <- subset(tmp_allobs_df, .src == "Train")
glb_newobs_df <- subset(tmp_allobs_df, .src == "Test")
# For every feature named in glb_interaction_only_feats_lst, record the corresponding list
# value in the new interaction.feat column (NA for all other features).
glb_feats_df$interaction.feat <- NA
for (feat in names(glb_interaction_only_feats_lst))
glb_feats_df[glb_feats_df$id %in% feat, "interaction.feat"] <-
glb_interaction_only_feats_lst[[feat]]
#stop(here"); glb_to_sav(); glb_allobs_df <- sav_allobs_df
# Candidate predictors: features that are not nzv and not flagged exclude.as.feat
indep_vars <- subset(glb_feats_df, !nzv & (exclude.as.feat != 1))[, "id"]
# Treat every id that does not contain the literal text ".fctr" as numeric
numeric_indep_vars <- indep_vars[!grepl(".fctr", indep_vars, fixed=TRUE)]
# Store the Shapiro-Wilk p-value of each numeric predictor (computed on the training rows);
# all other features keep NA.
# NOTE(review): the assignment relies on numeric_indep_vars being in the same order as the
# matching rows of glb_feats_df - true here because it is derived from glb_feats_df itself.
glb_feats_df$shapiro.test.p.value <- NA
glb_feats_df[glb_feats_df$id %in% numeric_indep_vars, "shapiro.test.p.value"] <-
sapply(numeric_indep_vars, function(var) shapiro.test(glb_trnobs_df[, var])$p.value)
# Features whose Shapiro-Wilk test rejects normality at the 5% level, plus the .rnorm
# reference feature, ordered by ascending p-value.
# The row filter must use the elementwise `|`: the scalar `||` only looks at the first row
# (and is an error for length > 1 operands in R >= 4.3), so it would keep either every row
# or none instead of filtering feature by feature.
not_nrml_feats_df <- glb_feats_df %>%
    subset(!is.na(shapiro.test.p.value)) %>%
    subset((shapiro.test.p.value < 0.05) | (id == ".rnorm")) %>%
    arrange(shapiro.test.p.value)
# Index the result by feature id
row.names(not_nrml_feats_df) <- not_nrml_feats_df$id
#plt_trnobs_df <- glb_trnobs_df[, c("D.npnct05.log", ".rnorm")]
# Violin plots of the (up to) five features with the smallest Shapiro-Wilk p-values, plus
# .rnorm, with glb_cluster_entropy_var as the x column; one free-scaled facet per feature.
# NOTE(review): if not_nrml_feats_df has zero rows, 1:min(5, 0) is 1:0 (i.e. c(1, 0)) and
# the id lookup yields NA - confirm that case cannot occur.
plt_trnobs_df <- glb_trnobs_df[, c(union(not_nrml_feats_df$id[1:min(5, nrow(not_nrml_feats_df))],
".rnorm"), glb_cluster_entropy_var)]
print(myplot_violin(plt_trnobs_df, setdiff(names(plt_trnobs_df), glb_cluster_entropy_var),
xcol_name = glb_cluster_entropy_var) +
facet_wrap(~variable, scales="free"))
#myplot_histogram(plt_trnobs_df, "D.npnct11.log", fill_col_name="sold", show_stats = TRUE)
myadjust_interaction_feats <- function(vars_vctr) {
    # Replaces each variable that has an interaction.feat entry in glb_feats_df by the
    # interaction term "<interaction.feat>:<variable>".
    #
    # Args:
    #   vars_vctr: character vector of variable names.
    # Returns:
    #   vars_vctr with every such variable removed and its interaction term appended
    #   (via union, so an already-present term is not duplicated).
    interaction_rows_df <- subset(glb_feats_df, !is.na(interaction.feat))
    for (feat_id in interaction_rows_df$id) {
        # Variables that are not in the requested set are left alone
        if (!(feat_id %in% vars_vctr))
            next
        partner <- glb_feats_df[glb_feats_df$id == feat_id, "interaction.feat"]
        interaction_term <- paste0(partner, ":", feat_id)
        remaining_vars <- setdiff(vars_vctr, feat_id)
        vars_vctr <- union(remaining_vars, interaction_term)
    }
    return(vars_vctr)
}
# Open question: should .clusterid.fctr be excluded from this, or should the encoding of
# glb_category_var:.clusterid.fctr be included?
# Predictors handed to RFE: non-nzv, non-excluded features, with interaction-only features
# rewritten as interaction terms by myadjust_interaction_feats().
indep_vars <- myadjust_interaction_feats(subset(glb_feats_df,
!nzv & (exclude.as.feat != 1))$id)
myrun_rfe <- function(obs_df, indep_vars, sizes=NULL) {
    # Runs recursive feature elimination (rfe / rfeControl / rfFuncs, presumably from the
    # caret package) with repeated cross-validation and prints the results.
    #
    # Args:
    #   obs_df:     observations to run the selection on.
    #   indep_vars: predictor names / terms passed to myget_vectorized_obs_df().
    #   sizes:      subset sizes to evaluate; when NULL, the (up to) five largest powers of 2
    #               not exceeding the number of vectorized predictors are used.
    # Returns:
    #   The object returned by rfe().
    vectorized_df <- myget_vectorized_obs_df(obs_df, glb_rsp_var, indep_vars)
    predictor_names <- setdiff(names(vectorized_df), glb_rsp_var)

    if (is.null(sizes)) {
        max_pow <- as.integer(log2(length(predictor_names)))
        sizes <- tail(2 ^ (1:max_pow), 5)
    }

    # One seed vector per resample plus one extra list element; each vector has one seed
    # per evaluated subset size plus one.
    n_seed_sets <- (glb_rcv_n_folds * glb_rcv_n_repeats) + 1
    resample_seeds <- mygen_seeds(seeds_lst_len = n_seed_sets,
                                  seeds_elmnt_lst_len = (length(sizes) + 1))
    ctrl <- rfeControl(functions = rfFuncs, method = "repeatedcv",
                       number = glb_rcv_n_folds, repeats = glb_rcv_n_repeats,
                       verbose = TRUE, returnResamp = "all",
                       seeds = resample_seeds)

    set.seed(113)
    # Not passed on purpose (left at rfe's defaults):
    #   metric = unlist(strsplit(glbMdlMetricsEval, "[.]"))[2],
    #   maximize = ifelse(unlist(strsplit(glbMdlMetricsEval, "[.]"))[1] == "max",
    #                     TRUE, FALSE),
    fit <- rfe(vectorized_df[, predictor_names],
               vectorized_df[, glb_rsp_var],
               sizes = sizes,
               rfeControl = ctrl)

    print(fit)
    print(predictors(fit))
    # print(plot(fit, type=c("g", "o")))
    # print(plot(fit))
    print(ggplot(fit))
    return(fit)
}
# Run recursive feature elimination on the fit observations with the configured subset sizes
rfe_fit_results <- myrun_rfe(glb_fitobs_df, indep_vars, glb_rfe_fit_sizes)
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (3 fold, repeated 3 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 47 0.7991 0.13837 0.011086 0.05097
## 48 0.7997 0.13099 0.012114 0.06462
## 49 0.7991 0.15147 0.006807 0.04573
## 50 0.7979 0.15259 0.012709 0.04648
## 51 0.7979 0.14968 0.010120 0.04119
## 52 0.8004 0.14609 0.008965 0.04817
## 53 0.8041 0.16752 0.010527 0.05464 *
## 54 0.8016 0.16486 0.009077 0.04280
## 140 0.7991 0.08836 0.008763 0.05230
##
## The top 5 variables (out of 53):
## spdiff.cut.fctr(-1e+03,-100], spdiff.cut.fctr(-10,-1], spdiff.cut.fctr(-1,0], prdl.descr.my.fctrUnknown#1:.clusterid.fctr3, sprice.d20nexp
##
## [1] "spdiff.cut.fctr(-1e+03,-100]"
## [2] "spdiff.cut.fctr(-10,-1]"
## [3] "spdiff.cut.fctr(-1,0]"
## [4] "prdl.descr.my.fctrUnknown#1:.clusterid.fctr3"
## [5] "sprice.d20nexp"
## [6] "sprice.log10"
## [7] "spdiff.cut.fctr(-100,-10]"
## [8] "sprice.root2"
## [9] "spdiff.cut.fctr(100,1e+03]"
## [10] "condition.fctrNew"
## [11] "prdl.descr.my.fctriPadmini#1:.clusterid.fctr3"
## [12] "D.terms.post.stop.n.log"
## [13] "spdiff.cut.fctr(10,100]"
## [14] "D.terms.post.stem.n.log"
## [15] "D.chrs.n.log"
## [16] "prdl.descr.my.fctriPad3#0"
## [17] "prdl.descr.my.fctriPad1#1:.clusterid.fctr3"
## [18] "D.wrds.unq.n.log"
## [19] "startprice.dcm1.is9"
## [20] "cellular.fctr1"
## [21] "D.ratio.weight.sum.wrds.n"
## [22] "D.ratio.wrds.stop.n.wrds.n"
## [23] "spdiff.cut.fctr(1,10]"
## [24] "prdl.descr.my.fctriPadmini3#0"
## [25] "D.chrs.uppr.n.log"
## [26] "prdl.descr.my.fctriPad4#0"
## [27] "prdl.descr.my.fctriPad1#1"
## [28] "D.wrds.n.log"
## [29] "prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2"
## [30] "D.weight.post.stem.sum"
## [31] "prdl.descr.my.fctriPadAir#1"
## [32] "condition.fctrFor parts or not working"
## [33] "cellular.fctr0:carrier.fctrNone"
## [34] "D.weight.sum"
## [35] "D.chrs.pnct11.n.log"
## [36] "prdl.descr.my.fctriPad2#1:.clusterid.fctr2"
## [37] "D.weight.post.stop.sum"
## [38] "prdl.descr.my.fctriPadmini#1:.clusterid.fctr2"
## [39] "cellular.fctr1:carrier.fctrVerizon"
## [40] "prdl.descr.my.fctrUnknown#1"
## [41] "cellular.fctrUnknown:carrier.fctrUnknown"
## [42] "color.fctrWhite"
## [43] "cellular.fctr1:carrier.fctrUnknown"
## [44] "D.wrds.stop.n.log"
## [45] "prdl.descr.my.fctriPadAir2#1"
## [46] "prdl.descr.my.fctriPadAir2#0"
## [47] "storage.fctr16"
## [48] "startprice.dgt2.is9"
## [49] "prdl.descr.my.fctriPadmini#0"
## [50] "prdl.descr.my.fctriPadmini2#0"
## [51] "prdl.descr.my.fctriPadAir#0"
## [52] "cellular.fctr1:carrier.fctrT-Mobile"
## [53] "cellular.fctrUnknown"
# print(all.equal(rfe_results[-which(names(rfe_results) == "times")],
# sav_rfe_results[-which(names(sav_rfe_results) == "times")]))
# require(mRMRe)
# indep_vars_vctr <- subset(glb_feats_df, !nzv &
# (exclude.as.feat != 1))[, "id"]
# indep_vars_vctr <- setdiff(indep_vars_vctr,
# myfind_fctr_cols_df(glb_trnobs_df[, c(glb_rsp_var, indep_vars_vctr)]))
# tmp_trnobs_df <- glb_trnobs_df[, c(glb_rsp_var, indep_vars_vctr)]
# tmp_trnobs_df$biddable <- as.numeric(tmp_trnobs_df$biddable)
# dd <- mRMR.data(data = tmp_trnobs_df)
# mRMRe.fltr <- mRMR.classic(data = dd, target_indices = c(1), feature_count = 10)
# print(solutions(mRMRe.fltr)[[1]])
# print(apply(solutions(mRMRe.fltr)[[1]], 2, function(x, y) { return(y[x]) },
# y=featureNames(dd)))
# print(featureNames(dd)[solutions(mRMRe.fltr)[[1]]])
# print(mRMRe.fltr@filters); print(mRMRe.fltr@scores)
# Data-quality report over all observations (missing values, zeros, Infs, NaNs - see the
# output below). NOTE(review): terminate = TRUE presumably stops the run on a blocking
# problem - confirm in mycheck_problem_data().
mycheck_problem_data(glb_allobs_df, featsExclude = glbFeatsExclude,
fctrMaxUniqVals = glbFctrMaxUniqVals, terminate = TRUE)
## [1] "numeric data missing in : "
## sold sold.fctr
## 422 422
## [1] "numeric data w/ 0s in : "
## biddable sold
## 1437 798
## cellular.fctr D.terms.post.stop.n
## 801 762
## D.terms.post.stop.n.log D.weight.post.stop.sum
## 762 762
## D.terms.post.stem.n D.terms.post.stem.n.log
## 762 762
## D.weight.post.stem.sum D.T.condit
## 762 1104
## D.T.is D.T.has
## 1220 1340
## D.T.or D.T.this
## 1350 1309
## D.T.it D.T.of
## 1352 1346
## D.T.some D.T.have
## 1381 1377
## D.T.devic D.T.for.
## 1369 1400
## D.T.wear D.T.but
## 1371 1382
## D.T.been D.T.broken
## 1390 1424
## D.T.will D.T.from
## 1413 1392
## D.T.damag D.T.sign
## 1420 1392
## D.T.list D.T.cover
## 1398 1423
## D.T.part D.T.tear
## 1424 1414
## D.T.smart D.wrds.n.log
## 1429 760
## D.wrds.unq.n.log D.weight.sum
## 762 762
## D.ratio.weight.sum.wrds.n D.chrs.n.log
## 762 760
## D.chrs.uppr.n.log D.dgts.n.log
## 762 1293
## D.chrs.pnct01.n.log D.chrs.pnct02.n.log
## 1397 1437
## D.chrs.pnct03.n.log D.chrs.pnct04.n.log
## 1436 1437
## D.chrs.pnct05.n.log D.chrs.pnct06.n.log
## 1378 1390
## D.chrs.pnct07.n.log D.chrs.pnct08.n.log
## 1420 1378
## D.chrs.pnct09.n.log D.chrs.pnct10.n.log
## 1422 1428
## D.chrs.pnct11.n.log D.chrs.pnct12.n.log
## 1234 1369
## D.chrs.pnct13.n.log D.chrs.pnct14.n.log
## 1007 1383
## D.chrs.pnct15.n.log D.chrs.pnct16.n.log
## 1423 1432
## D.chrs.pnct17.n.log D.chrs.pnct18.n.log
## 1436 1436
## D.chrs.pnct19.n.log D.chrs.pnct20.n.log
## 1437 1437
## D.chrs.pnct21.n.log D.chrs.pnct22.n.log
## 1437 1437
## D.chrs.pnct23.n.log D.chrs.pnct24.n.log
## 1437 1437
## D.chrs.pnct25.n.log D.chrs.pnct26.n.log
## 1437 1437
## D.chrs.pnct27.n.log D.chrs.pnct28.n.log
## 1437 1429
## D.chrs.pnct29.n.log D.chrs.pnct30.n.log
## 1437 1437
## D.wrds.stop.n.log D.P.http
## 1042 1437
## D.P.mini D.P.air
## 1418 1425
## D.P.black D.P.white
## 1432 1432
## D.P.gold D.P.spacegray
## 1436 1435
## startprice.dgt1.is9 startprice.dgt2.is9
## 871 1226
## startprice.dgt3.is9 startprice.dcm1.is9
## 1432 878
## startprice.dcm2.is9
## 996
## [1] "numeric data w/ Infs in : "
## named integer(0)
## [1] "numeric data w/ NaNs in : "
## named integer(0)
## [1] "string data missing in : "
## description condition cellular carrier color storage
## 760 0 0 0 0 0
## productline .grpid descr.my .lcn
## 0 NA 760 422
# glb_allobs_df %>% filter(is.na(Married.fctr)) %>% tbl_df()
# glb_allobs_df %>% count(Married.fctr)
# levels(glb_allobs_df$Married.fctr)
# Report the dimensions (rows, columns) of the feature table
print("glb_feats_df:"); print(dim(glb_feats_df))
## [1] "glb_feats_df:"
## [1] 102 12
# Keep a copy of the feature table; the immediate restore is a no-op on a straight run and
# only matters when re-running from this point interactively.
sav_feats_df <- glb_feats_df
glb_feats_df <- sav_feats_df
# Flag the row of the raw response variable
glb_feats_df[, "rsp_var_raw"] <- FALSE
glb_feats_df[glb_feats_df$id == glb_rsp_var_raw, "rsp_var_raw"] <- TRUE
# Convert exclude.as.feat from a 0/1 code to a logical
glb_feats_df$exclude.as.feat <- (glb_feats_df$exclude.as.feat == 1)
# Flag the id-variable row; the id_var column is only created when an id variable other
# than ".rownames" is configured.
if (!is.null(glb_id_var) && glb_id_var != ".rownames")
glb_feats_df[glb_feats_df$id %in% glb_id_var, "id_var"] <- TRUE
# One-row table describing the response variable glb_rsp_var, excluded as a feature
add_feats_df <- data.frame(id=glb_rsp_var, exclude.as.feat=TRUE, rsp_var=TRUE)
row.names(add_feats_df) <- add_feats_df$id; print(add_feats_df)
## id exclude.as.feat rsp_var
## sold.fctr sold.fctr TRUE TRUE
# Append the response-variable row to the feature table
glb_feats_df <- myrbind_df(glb_feats_df, add_feats_df)
# Show the bookkeeping rows (raw response, response, id variable).
# The id_var column only exists when an id variable other than ".rownames" is configured,
# so the test mirrors the one used when that column is created; the is.null() guard also
# keeps `if` from failing with "argument is of length zero" when glb_id_var is NULL.
if (!is.null(glb_id_var) && glb_id_var != ".rownames")
    print(subset(glb_feats_df, rsp_var_raw | rsp_var | id_var)) else
    print(subset(glb_feats_df, rsp_var_raw | rsp_var))
## id cor.y exclude.as.feat cor.y.abs cor.high.X
## sold sold 1.00000000 TRUE 1.00000000 <NA>
## UniqueID UniqueID -0.01568743 TRUE 0.01568743 <NA>
## sold.fctr sold.fctr NA TRUE NA <NA>
## freqRatio percentUnique zeroVar nzv is.cor.y.abs.low
## sold 3.677419 0.1970443 FALSE FALSE FALSE
## UniqueID 1.000000 100.0000000 FALSE FALSE FALSE
## sold.fctr NA NA NA NA NA
## interaction.feat shapiro.test.p.value rsp_var_raw id_var rsp_var
## sold <NA> NA TRUE NA NA
## UniqueID <NA> NA FALSE TRUE NA
## sold.fctr <NA> NA NA NA TRUE
# Consistency check 1: feature ids that have no matching column in glb_allobs_df
print("glb_feats_df vs. glb_allobs_df: ");
## [1] "glb_feats_df vs. glb_allobs_df: "
print(setdiff(glb_feats_df$id, names(glb_allobs_df)))
## character(0)
# Consistency check 2: glb_allobs_df columns that have no row in glb_feats_df
print("glb_allobs_df vs. glb_feats_df: ");
## [1] "glb_allobs_df vs. glb_feats_df: "
# Ensure these are only chr vars: after removing the columns reported by
# myfind_chr_cols_df(), nothing should be left over
print(setdiff(setdiff(names(glb_allobs_df), glb_feats_df$id),
myfind_chr_cols_df(glb_allobs_df)))
## character(0)
# Optionally save the feature table and all observations to <glb_out_pfx>selfts_dsk.RData
if (glb_save_envir)
save(glb_feats_df,
glb_allobs_df, #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
file=paste0(glb_out_pfx, "selfts_dsk.RData"))
# Disabled read-back verification of the saved objects:
# load(paste0(glb_out_pfx, "blddfs_dsk.RData"))
# if (!all.equal(tmp_feats_df, glb_feats_df))
# stop("glb_feats_df r/w not working")
# if (!all.equal(tmp_entity_df, glb_allobs_df))
# stop("glb_allobs_df r/w not working")
# Register the start of the "fit.models" major step in the chunk timing table
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=TRUE)
## label step_major step_minor label_minor bgn end
## 9 select.features 5 0 0 194.566 221.813
## 10 fit.models 6 0 0 221.813 NA
## elapsed
## 9 27.247
## 10 NA
6.0: fit models
# load(paste0(glb_out_pfx, "dsk.RData"))
get_model_sel_frmla <- function() {
    # Builds the one-sided formula used to rank models.
    #
    # Every metric in glbMdlMetricsEval that is a column of glb_models_df becomes a term:
    # preceded by "-" when its name contains "max", by "+" otherwise. For binomial
    # classification a trailing "- opt.prob.threshold.OOB" term is added.
    #
    # Returns:
    #   A formula such as ~ -max.Accuracy.OOB + min.aic.fit - opt.prob.threshold.OOB

    # min.aic.fit might not be available, so keep only metrics present in glb_models_df
    is_avl_metric <- glbMdlMetricsEval %in% names(glb_models_df)
    avl_metrics <- glbMdlMetricsEval[is_avl_metric]

    signed_terms <- unlist(lapply(avl_metrics, function(metric) {
        sort_sign <- if (length(grep("max", metric)) > 0) "-" else "+"
        c(sort_sign, metric)
    }), use.names = FALSE)

    if (glb_is_classification && glb_is_binomial)
        signed_terms <- c(signed_terms, "-", "opt.prob.threshold.OOB")

    frmla_txt <- paste(c("~ ", signed_terms), collapse = " ")
    return(as.formula(frmla_txt))
}
get_dsp_models_df <- function() {
    # Returns the model summary table ordered by the model-selection formula and, as a side
    # effect, writes the tuning plots of all plottable models to <glb_out_pfx>bestTune.png.
    #
    # Reads the globals glb_models_df, glb_models_lst, glbMdlMetricsEval and glb_out_pfx.
    #
    # Returns:
    #   glb_models_df ordered by get_model_sel_frmla(), restricted to the id column, the
    #   available evaluation metrics and every column whose name contains "opt.".
    dsp_models_cols <- c("id",
                    glbMdlMetricsEval[glbMdlMetricsEval %in% names(glb_models_df)],
                    grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE))
    dsp_models_df <-
        #orderBy(get_model_sel_frmla(), glb_models_df)[, c("id", glbMdlMetricsEval)]
        orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols]

    # Number of rows in each model's results table ...
    nCvMdl <- sapply(glb_models_lst, function(mdl) nrow(mdl$results))
    # ... and its number of tuning parameters (0 for "custom" methods; modelLookup is
    # presumably caret's)
    nParams <- sapply(glb_models_lst, function(mdl) ifelse(mdl$method == "custom", 0,
        nrow(subset(modelLookup(mdl$method), parameter != "parameter"))))

#     nCvMdl <- nCvMdl[names(nCvMdl) != "avNNet"]
#     nParams <- nParams[names(nParams) != "avNNet"]

    # Models with no more result rows than tuning parameters are reported as problems
    if (length(cvMdlProblems <- nCvMdl[nCvMdl <= nParams]) > 0) {
        print("Cross Validation issues:")
        warning("Cross Validation issues:")
        print(cvMdlProblems)
    }

    # Plot only models without such problems and with at least one tuning parameter
    pltMdls <- setdiff(names(nCvMdl), names(cvMdlProblems))
    pltMdls <- setdiff(pltMdls, names(nParams[nParams == 0]))

    # length(pltMdls) == 21
    png(paste0(glb_out_pfx, "bestTune.png"), width = 480 * 2, height = 480 * 4)
    # Close the png device when the function exits, including when a plot below fails;
    # otherwise an error would leave the device open and the file incomplete.
    png_dev <- dev.cur()
    on.exit(dev.off(which = png_dev), add = TRUE)

    # Two-column grid, filled row by row
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(ceiling(length(pltMdls) / 2.0), 2)))
    pltIx <- 1
    for (mdlId in pltMdls) {
        print(ggplot(glb_models_lst[[mdlId]], highBestTune = TRUE) + labs(title = mdlId),
              vp = viewport(layout.pos.row = ceiling(pltIx / 2.0),
                            layout.pos.col = ((pltIx - 1) %% 2) + 1))
        pltIx <- pltIx + 1
    }

    return(dsp_models_df)
}
#get_dsp_models_df()
# Guard for binomial classification: abort when the response column
# (glb_rsp_var) of glb_fitobs_df holds fewer than two distinct values;
# the error message lists the distinct values that were found.
if (glb_is_classification && glb_is_binomial &&
(length(unique(glb_fitobs_df[, glb_rsp_var])) < 2))
stop("glb_fitobs_df$", glb_rsp_var, ": contains less than 2 unique values: ",
paste0(unique(glb_fitobs_df[, glb_rsp_var]), collapse=", "))
# Ids of the (up to) two features with the largest cor.y.abs among the rows of
# glb_feats_df that satisfy: exclude.as.feat == 0, not nzv, not is.cor.y.abs.low,
# and cor.high.X is NA.
# head(..., 2) is used instead of [1:2, "id"] because row-indexing with 1:2 pads
# the result with NA when fewer than two rows qualify, and those NA names would
# then be passed on as predictors to the model fits below.
max_cor_y_x_vars <- head(orderBy(~ -cor.y.abs,
        subset(glb_feats_df, (exclude.as.feat == 0) & !nzv & !is.cor.y.abs.low &
                                is.na(cor.high.X)))[, "id"], 2)
if (length(max_cor_y_x_vars) < 2)
    warning("max_cor_y_x_vars: only ", length(max_cor_y_x_vars),
            " feature(s) qualified; 2 expected")
# while(length(max_cor_y_x_vars) < 2) {
# max_cor_y_x_vars <- c(max_cor_y_x_vars, orderBy(~ -cor.y.abs,
# subset(glb_feats_df, (exclude.as.feat == 0) & !is.cor.y.abs.low))[3, "id"])
# }
#stop(here"); glb_to_sav(); glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df
# When a Baseline variable is configured, abort if the top-ranked feature
# (max_cor_y_x_vars[1]) is a different variable whose cor.y.abs in glb_feats_df
# is larger than the Baseline variable's.
# NOTE(review): `&` is the element-wise operator, so both comparisons are always
#   evaluated; `&&` is the usual choice inside if() - confirm before changing.
if (!is.null(glb_Baseline_mdl_var)) {
if ((max_cor_y_x_vars[1] != glb_Baseline_mdl_var) &
(glb_feats_df[glb_feats_df$id == max_cor_y_x_vars[1], "cor.y.abs"] >
glb_feats_df[glb_feats_df$id == glb_Baseline_mdl_var, "cor.y.abs"]))
stop(max_cor_y_x_vars[1], " has a higher correlation with ", glb_rsp_var,
" than the Baseline var: ", glb_Baseline_mdl_var)
}
# Label passed as `type` in the model specs of the fits below:
# "regression" when glb_is_regression is TRUE, "classification" otherwise.
glb_model_type <- ifelse(glb_is_regression, "regression", "classification")
# Model specs
# NOTE: the vector below is not assigned to anything; evaluating it only prints
# the listed names (see the output that follows).
c("id.prefix", "method", "type",
# trainControl params
"preProc.method", "cv.n.folds", "cv.n.repeats", "summary.fn",
# train params
"metric", "metric.maximize", "tune.df")
## [1] "id.prefix" "method" "type"
## [4] "preProc.method" "cv.n.folds" "cv.n.repeats"
## [7] "summary.fn" "metric" "metric.maximize"
## [10] "tune.df"
# Baseline
# Fit the baseline classifier on the single configured Baseline variable; the
# call is skipped entirely when glb_Baseline_mdl_var is NULL.
# This call previously used the legacy argument names (mdl_id, model_method,
# indep_vars_vctr, rsp_var_out). Every other live myfit_mdl call in this file
# passes mdl_specs_lst / indep_vars, and the remaining legacy-style call (the
# old "Random" fit) is commented out, so the call is aligned with the current
# convention here.
# NOTE(review): myfit_mdl's signature is not visible from this section - confirm.
#   Judging by the other fits ("MFO" -> "MFO.myMFO_classfr"), the model id
#   presumably becomes "Baseline.mybaseln_classfr" rather than "Baseline";
#   check any downstream lookups by id.
if (!is.null(glb_Baseline_mdl_var))
    ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Baseline", type = glb_model_type, trainControl.method = "none",
            train.method = "mybaseln_classfr")),
        indep_vars = glb_Baseline_mdl_var, rsp_var = glb_rsp_var,
        fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
# Most Frequent Outcome "MFO" model: mean(y) for regression
# Not using caret's nullModel since model stats not avl
# Cannot use rpart for multinomial classification since it predicts non-MFO
# Fitted without resampling (trainControl.method = "none") on the single
# predictor ".rnorm"; train.method is "lm" for regression and "myMFO_classfr"
# otherwise.
# NOTE(review): the run log below shows a "the condition has length > 1" warning
#   raised inside myfit_mdl for this call - presumably train.method has more than
#   one element by the time it is tested there; verify in myfit_mdl.
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "MFO", type = glb_model_type, trainControl.method = "none",
train.method = ifelse(glb_is_regression, "lm", "myMFO_classfr"))),
indep_vars = ".rnorm", rsp_var = glb_rsp_var,
fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
## [1] "fitting model: MFO.myMFO_classfr"
## [1] " indep_vars: .rnorm"
## Warning in if (mdl_specs_lst[["train.method"]] %in% c("bayesglm", "glm", :
## the condition has length > 1 and only the first element will be used
## Fitting parameter = none on full training set
## [1] "in MFO.Classifier$fit"
## [1] "unique.vals:"
## [1] N Y
## Levels: N Y
## [1] "unique.prob:"
## y
## N Y
## 0.7984934 0.2015066
## [1] "MFO.val:"
## [1] "N"
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 -none- numeric
## MFO.val 1 -none- character
## x.names 1 -none- character
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## Loading required namespace: pROC
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
##
## The following object is masked from 'package:wordcloud':
##
## textplot
##
## The following object is masked from 'package:stats':
##
## lowess
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.7984934 0.2015066
## 2 0.7984934 0.2015066
## 3 0.7984934 0.2015066
## 4 0.7984934 0.2015066
## 5 0.7984934 0.2015066
## 6 0.7984934 0.2015066
## sold.fctr sold.fctr.predict.MFO.myMFO_classfr.Y
## 1 N 424
## 2 Y 107
## Prediction
## Reference N Y
## N 0 424
## Y 0 107
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.015066e-01 0.000000e+00 1.681880e-01 2.381872e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 8.945646e-94
## [1] "entr MFO.Classifier$predict"
## [1] "exit MFO.Classifier$predict"
## [1] "in MFO.Classifier$prob"
## N Y
## 1 0.7984934 0.2015066
## 2 0.7984934 0.2015066
## 3 0.7984934 0.2015066
## 4 0.7984934 0.2015066
## 5 0.7984934 0.2015066
## 6 0.7984934 0.2015066
## sold.fctr sold.fctr.predict.MFO.myMFO_classfr.Y
## 1 N 374
## 2 Y 110
## Prediction
## Reference N Y
## N 0 374
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.272727e-01 0.000000e+00 1.906680e-01 2.672469e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 6.857299e-83
## id feats max.nTuningRuns min.elapsedtime.everything
## 1 MFO.myMFO_classfr .rnorm 0 0.277
## min.elapsedtime.final max.AUCpROC.fit max.Sens.fit max.Spec.fit
## 1 0.003 0.5 1 0
## max.AUCROCR.fit opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.5 0.2 0.3354232 0.2015066
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.168188 0.2381872 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.3703704 0.2272727
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.190668 0.2672469 0
# Fit the "Random" reference classifier: no resampling
# (trainControl.method = "none"), single predictor ".rnorm",
# train.method "myrandom_classfr". Skipped unless glb_is_classification is TRUE.
if (glb_is_classification)
# "random" model - only for classification;
# none needed for regression since it is same as MFO
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = "Random", type = glb_model_type, trainControl.method = "none",
train.method = "myrandom_classfr")),
indep_vars = ".rnorm", rsp_var = glb_rsp_var,
fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
## [1] "fitting model: Random.myrandom_classfr"
## [1] " indep_vars: .rnorm"
## Warning in if (mdl_specs_lst[["train.method"]] %in% c("bayesglm", "glm", :
## the condition has length > 1 and only the first element will be used
## Fitting parameter = none on full training set
## Length Class Mode
## unique.vals 2 factor numeric
## unique.prob 2 table numeric
## xNames 1 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## [1] "in Random.Classifier$prob"
## sold.fctr sold.fctr.predict.Random.myrandom_classfr.Y
## 1 N 424
## 2 Y 107
## Prediction
## Reference N Y
## N 0 424
## Y 0 107
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.015066e-01 0.000000e+00 1.681880e-01 2.381872e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 8.945646e-94
## [1] "in Random.Classifier$prob"
## sold.fctr sold.fctr.predict.Random.myrandom_classfr.Y
## 1 N 374
## 2 Y 110
## Prediction
## Reference N Y
## N 0 374
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.272727e-01 0.000000e+00 1.906680e-01 2.672469e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 6.857299e-83
## id feats max.nTuningRuns
## 1 Random.myrandom_classfr .rnorm 0
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 0.287 0.002 0.5282909
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.8042453 0.2523364 0.5050586 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.3354232 0.2015066 0.168188
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.2381872 0 0.5168449 0.815508
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.2181818 0.4962567 0.2 0.3703704
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.2272727 0.190668 0.2672469
## max.Kappa.OOB
## 1 0
# ret_lst <- myfit_mdl(mdl_id = "Random", model_method = "myrandom_classfr",
# model_type = glb_model_type,
# indep_vars_vctr = ".rnorm",
# rsp_var = glb_rsp_var, rsp_var_out = glb_rsp_var_out,
# fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
# Any model that has tuning parameters gets "better" results with cross-validation
# (except bag & rf) & "different" results for different outcome metrics
# Max.cor.Y
# Check impact of cv
# rpart is not a good candidate since caret does not optimize cp (only tuning parameter of rpart) well
# Reference glmnet fit without resampling (trainControl.method="none") on the
# features in max_cor_y_x_vars; presumably the "1X1" in the id marks it as the
# no-CV counterpart of the "<folds>X<repeats>" fits that follow.
ret_lst <- myfit_mdl(mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
id.prefix="Max.cor.Y.rcv.1X1", type=glb_model_type, trainControl.method="none",
train.method="glmnet")),
indep_vars=max_cor_y_x_vars, rsp_var=glb_rsp_var,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
## [1] "fitting model: Max.cor.Y.rcv.1X1.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Loading required package: glmnet
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## Loaded glmnet 2.0-2
## Fitting alpha = 0.1, lambda = 0.00175 on full training set
## Length Class Mode
## a0 96 -none- numeric
## beta 768 dgCMatrix S4
## df 96 -none- numeric
## dim 2 -none- numeric
## lambda 96 -none- numeric
## dev.ratio 96 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-100,-10]
## -2.9190137 1.9085131
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## 3.4381308 7.5473894
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(1,10]
## 3.2635976 2.8847349
## spdiff.cut.fctr(10,100] spdiff.cut.fctr(100,1e+03]
## 2.6158061 0.9264224
## sprice.root2
## -0.0394647
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-100,-10]
## -2.97511399 1.95697679
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## 3.48869407 7.67710579
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(1,10]
## 3.31726627 2.93595250
## spdiff.cut.fctr(10,100] spdiff.cut.fctr(100,1e+03]
## 2.66542761 0.97921940
## sprice.root2
## -0.03892456
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.glmnet.N
## 1 N 308
## 2 Y 37
## sold.fctr.predict.Max.cor.Y.rcv.1X1.glmnet.Y
## 1 116
## 2 70
## Prediction
## Reference N Y
## N 308 116
## Y 37 70
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.118644e-01 2.982924e-01 6.712958e-01 7.500560e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999993e-01 2.864754e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.glmnet.N
## 1 N 252
## 2 Y 29
## sold.fctr.predict.Max.cor.Y.rcv.1X1.glmnet.Y
## 1 122
## 2 81
## Prediction
## Reference N Y
## N 252 122
## Y 29 81
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.880165e-01 3.158979e-01 6.446506e-01 7.290671e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999929e-01 7.054420e-14
## id feats max.nTuningRuns
## 1 Max.cor.Y.rcv.1X1.glmnet spdiff.cut.fctr,sprice.root2 0
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 0.722 0.032 0.5653324
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.9811321 0.1495327 0.7580453 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.4778157 0.7118644 0.6712958
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.750056 0.2982924 0.5529412 0.9786096
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.1272727 0.7423189 0.2 0.5175719
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.6880165 0.6446506 0.7290671
## max.Kappa.OOB
## 1 0.3158979
# rcv_n_folds == 1 & rcv_n_repeats > 1 crashes
# Fit the same glmnet model on max_cor_y_x_vars under "repeatedcv" for every
# combination of
#   folds   in seq(3, glb_rcv_n_folds + 2, 2)
#   repeats in seq(1, glb_rcv_n_repeats + 2, 2)
# Each fit's id.prefix encodes the combination as "Max.cor.Y.rcv.<folds>X<repeats>".
# ret_lst is overwritten on every iteration.
for (rcv_n_folds in seq(3, glb_rcv_n_folds + 2, 2))
for (rcv_n_repeats in seq(1, glb_rcv_n_repeats + 2, 2)) {
ret_lst <- myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = paste0("Max.cor.Y.rcv.", rcv_n_folds, "X", rcv_n_repeats),
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = rcv_n_folds, trainControl.repeats = rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.method = "glmnet", train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize)),
indep_vars = max_cor_y_x_vars, rsp_var = glb_rsp_var,
fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
}
## [1] "fitting model: Max.cor.Y.rcv.3X1.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 74 -none- numeric
## beta 592 dgCMatrix S4
## df 74 -none- numeric
## dim 2 -none- numeric
## lambda 74 -none- numeric
## dev.ratio 74 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.03165954 1.06135700 2.07411021
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.43394590 0.34905684 -0.03670655
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.03888845 1.13028806 2.22933838
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.51041350 0.41018734 -0.03819162
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.3X1.glmnet.N
## 1 N 306
## 2 Y 39
## sold.fctr.predict.Max.cor.Y.rcv.3X1.glmnet.Y
## 1 118
## 2 68
## Prediction
## Reference N Y
## N 306 118
## Y 39 68
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.043315e-01 2.799471e-01 6.635135e-01 7.428574e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999999e-01 4.813174e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.3X1.glmnet.N
## 1 N 254
## 2 Y 36
## sold.fctr.predict.Max.cor.Y.rcv.3X1.glmnet.Y
## 1 120
## 2 74
## Prediction
## Reference N Y
## N 254 120
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.776860e-01 2.771693e-01 6.340242e-01 7.191601e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999994e-01 3.025623e-11
## id feats max.nTuningRuns
## 1 Max.cor.Y.rcv.3X1.glmnet spdiff.cut.fctr,sprice.root2 25
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 1.469 0.026 0.5233645
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 1 0.04672897 0.7289279 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.4641638 0.8098207 0.6635135
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.7428574 0.0866275 0.5187166 0.9919786
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.04545455 0.7148517 0.2 0.4868421
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.677686 0.6340242 0.7191601
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.2771693 0.007588213 0.04350633
## [1] "fitting model: Max.cor.Y.rcv.3X3.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 74 -none- numeric
## beta 592 dgCMatrix S4
## df 74 -none- numeric
## dim 2 -none- numeric
## lambda 74 -none- numeric
## dev.ratio 74 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.03165954 1.06135700 2.07411021
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.43394590 0.34905684 -0.03670655
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.03888845 1.13028806 2.22933838
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.51041350 0.41018734 -0.03819162
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.3X3.glmnet.N
## 1 N 306
## 2 Y 39
## sold.fctr.predict.Max.cor.Y.rcv.3X3.glmnet.Y
## 1 118
## 2 68
## Prediction
## Reference N Y
## N 306 118
## Y 39 68
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.043315e-01 2.799471e-01 6.635135e-01 7.428574e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999999e-01 4.813174e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.3X3.glmnet.N
## 1 N 254
## 2 Y 36
## sold.fctr.predict.Max.cor.Y.rcv.3X3.glmnet.Y
## 1 120
## 2 74
## Prediction
## Reference N Y
## N 254 120
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.776860e-01 2.771693e-01 6.340242e-01 7.191601e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999994e-01 3.025623e-11
## id feats max.nTuningRuns
## 1 Max.cor.Y.rcv.3X3.glmnet spdiff.cut.fctr,sprice.root2 25
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 2.097 0.024 0.5233645
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 1 0.04672897 0.7289279 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.4641638 0.8110598 0.6635135
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.7428574 0.09445917 0.5187166 0.9919786
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.04545455 0.7148517 0.2 0.4868421
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.677686 0.6340242 0.7191601
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.2771693 0.008672284 0.06210202
## [1] "fitting model: Max.cor.Y.rcv.3X5.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = paste0("Max.cor.Y.rcv.", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 74 -none- numeric
## beta 592 dgCMatrix S4
## df 74 -none- numeric
## dim 2 -none- numeric
## lambda 74 -none- numeric
## dev.ratio 74 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.03165954 1.06135700 2.07411021
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.43394590 0.34905684 -0.03670655
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.03888845 1.13028806 2.22933838
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.51041350 0.41018734 -0.03819162
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.3X5.glmnet.N
## 1 N 306
## 2 Y 39
## sold.fctr.predict.Max.cor.Y.rcv.3X5.glmnet.Y
## 1 118
## 2 68
## Prediction
## Reference N Y
## N 306 118
## Y 39 68
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.043315e-01 2.799471e-01 6.635135e-01 7.428574e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999999e-01 4.813174e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.3X5.glmnet.N
## 1 N 254
## 2 Y 36
## sold.fctr.predict.Max.cor.Y.rcv.3X5.glmnet.Y
## 1 120
## 2 74
## Prediction
## Reference N Y
## N 254 120
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.776860e-01 2.771693e-01 6.340242e-01 7.191601e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999994e-01 3.025623e-11
## id feats max.nTuningRuns
## 1 Max.cor.Y.rcv.3X5.glmnet spdiff.cut.fctr,sprice.root2 25
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 2.487 0.026 0.5233645
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 1 0.04672897 0.7289279 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.4641638 0.8098088 0.6635135
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.7428574 0.08571096 0.5187166 0.9919786
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.04545455 0.7148517 0.2 0.4868421
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.677686 0.6340242 0.7191601
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.2771693 0.007730242 0.05270539
## [1] "fitting model: Max.cor.Y.rcv.5X1.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.55, lambda = 0.00812 on full training set
## Length Class Mode
## a0 77 -none- numeric
## beta 616 dgCMatrix S4
## df 77 -none- numeric
## dim 2 -none- numeric
## lambda 77 -none- numeric
## dev.ratio 77 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-100,-10]
## -1.76500819 0.85025858
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## 2.32618922 4.86968673
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(1,10]
## 1.87614108 1.75357654
## spdiff.cut.fctr(10,100] sprice.root2
## 1.53358399 -0.04816437
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-100,-10]
## -1.81305070 0.90535740
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## 2.38450212 5.01047282
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(1,10]
## 1.95616980 1.81203035
## spdiff.cut.fctr(10,100] sprice.root2
## 1.58966923 -0.04830913
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.5X1.glmnet.N
## 1 N 308
## 2 Y 37
## sold.fctr.predict.Max.cor.Y.rcv.5X1.glmnet.Y
## 1 116
## 2 70
## Prediction
## Reference N Y
## N 308 116
## Y 37 70
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.118644e-01 2.982924e-01 6.712958e-01 7.500560e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999993e-01 2.864754e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.5X1.glmnet.N
## 1 N 250
## 2 Y 31
## sold.fctr.predict.Max.cor.Y.rcv.5X1.glmnet.Y
## 1 124
## 2 79
## Prediction
## Reference N Y
## N 250 124
## Y 31 79
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.797521e-01 2.977759e-01 6.361476e-01 7.211434e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999990e-01 1.472514e-13
## id feats max.nTuningRuns
## 1 Max.cor.Y.rcv.5X1.glmnet spdiff.cut.fctr,sprice.root2 25
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 1.685 0.025 0.5536722
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.9858491 0.1214953 0.7585302 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.4778157 0.8154583 0.6712958
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.750056 0.1627272 0.5497326 0.9812834
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.1181818 0.7398882 0.2 0.5047923
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.6797521 0.6361476 0.7211434
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.2977759 0.004163174 0.04402859
## [1] "fitting model: Max.cor.Y.rcv.5X3.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.55, lambda = 0.00812 on full training set
## Length Class Mode
## a0 77 -none- numeric
## beta 616 dgCMatrix S4
## df 77 -none- numeric
## dim 2 -none- numeric
## lambda 77 -none- numeric
## dev.ratio 77 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-100,-10]
## -1.76500819 0.85025858
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## 2.32618922 4.86968673
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(1,10]
## 1.87614108 1.75357654
## spdiff.cut.fctr(10,100] sprice.root2
## 1.53358399 -0.04816437
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-100,-10]
## -1.81305070 0.90535740
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## 2.38450212 5.01047282
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(1,10]
## 1.95616980 1.81203035
## spdiff.cut.fctr(10,100] sprice.root2
## 1.58966923 -0.04830913
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.5X3.glmnet.N
## 1 N 308
## 2 Y 37
## sold.fctr.predict.Max.cor.Y.rcv.5X3.glmnet.Y
## 1 116
## 2 70
## Prediction
## Reference N Y
## N 308 116
## Y 37 70
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.118644e-01 2.982924e-01 6.712958e-01 7.500560e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999993e-01 2.864754e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.5X3.glmnet.N
## 1 N 250
## 2 Y 31
## sold.fctr.predict.Max.cor.Y.rcv.5X3.glmnet.Y
## 1 124
## 2 79
## Prediction
## Reference N Y
## N 250 124
## Y 31 79
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.797521e-01 2.977759e-01 6.361476e-01 7.211434e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999990e-01 1.472514e-13
## id feats max.nTuningRuns
## 1 Max.cor.Y.rcv.5X3.glmnet spdiff.cut.fctr,sprice.root2 25
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 2.469 0.024 0.5536722
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.9858491 0.1214953 0.7585302 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.4778157 0.8123155 0.6712958
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.750056 0.1464828 0.5497326 0.9812834
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.1181818 0.7398882 0.2 0.5047923
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.6797521 0.6361476 0.7211434
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.2977759 0.01330353 0.07756812
## [1] "fitting model: Max.cor.Y.rcv.5X5.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.55, lambda = 0.00812 on full training set
## Length Class Mode
## a0 77 -none- numeric
## beta 616 dgCMatrix S4
## df 77 -none- numeric
## dim 2 -none- numeric
## lambda 77 -none- numeric
## dev.ratio 77 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-100,-10]
## -1.76500819 0.85025858
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## 2.32618922 4.86968673
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(1,10]
## 1.87614108 1.75357654
## spdiff.cut.fctr(10,100] sprice.root2
## 1.53358399 -0.04816437
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-100,-10]
## -1.81305070 0.90535740
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## 2.38450212 5.01047282
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(1,10]
## 1.95616980 1.81203035
## spdiff.cut.fctr(10,100] sprice.root2
## 1.58966923 -0.04830913
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.5X5.glmnet.N
## 1 N 308
## 2 Y 37
## sold.fctr.predict.Max.cor.Y.rcv.5X5.glmnet.Y
## 1 116
## 2 70
## Prediction
## Reference N Y
## N 308 116
## Y 37 70
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.118644e-01 2.982924e-01 6.712958e-01 7.500560e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999993e-01 2.864754e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.5X5.glmnet.N
## 1 N 250
## 2 Y 31
## sold.fctr.predict.Max.cor.Y.rcv.5X5.glmnet.Y
## 1 124
## 2 79
## Prediction
## Reference N Y
## N 250 124
## Y 31 79
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.797521e-01 2.977759e-01 6.361476e-01 7.211434e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999990e-01 1.472514e-13
## id feats max.nTuningRuns
## 1 Max.cor.Y.rcv.5X5.glmnet spdiff.cut.fctr,sprice.root2 25
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 3.077 0.024 0.5536722
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 0.9858491 0.1214953 0.7585302 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.4778157 0.8101741 0.6712958
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.750056 0.1450438 0.5497326 0.9812834
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0.1181818 0.7398882 0.2 0.5047923
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.6797521 0.6361476 0.7211434
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0.2977759 0.01366974 0.07197471
# Add parallel coordinates graph of glb_models_df[, glbMdlMetricsEval] to evaluate cv parameters
# Columns shown: model id, number of tuning runs, the evaluation metrics that
# exist in glb_models_df, and every column whose name contains "opt."
tmp_models_cols <- c(
    "id", "max.nTuningRuns",
    Filter(function(mtc) mtc %in% names(glb_models_df), glbMdlMetricsEval),
    grep("opt.", names(glb_models_df), fixed = TRUE, value = TRUE)
)
# Only the "Max.cor.Y.rcv." models are plotted; the feats column is dropped first
print(myplot_parcoord(
    obs_df = subset(glb_models_df,
                    subset = grepl("Max.cor.Y.rcv.", id, fixed = TRUE),
                    select = -feats)[, tmp_models_cols],
    id_var = "id"
))
# Single rpart fit without resampling (trainControl.method="none") on
# max_cor_y_x_vars. The tune.df row sets min = max = 0.0 for the cp parameter,
# and the run log below reports "Fitting cp = 0".
ret_lst <- myfit_mdl(mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
id.prefix="Max.cor.Y.rcv.1X1.cp.0", type=glb_model_type, trainControl.method="none",
train.method="rpart",
tune.df=data.frame(method="rpart", parameter="cp", min=0.0, max=0.0, by=0.1))),
indep_vars=max_cor_y_x_vars, rsp_var=glb_rsp_var,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
## [1] "fitting model: Max.cor.Y.rcv.1X1.cp.0.rpart"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Loading required package: rpart
## Fitting cp = 0 on full training set
## Loading required package: rpart.plot
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 531
##
## CP nsplit rel error
## 1 0.028037383 0 1.0000000
## 2 0.007009346 2 0.9439252
## 3 0.003115265 12 0.8504673
## 4 0.000000000 15 0.8411215
##
## Variable importance
## sprice.root2 spdiff.cut.fctr(-10,-1]
## 49 22
## spdiff.cut.fctr(100,1e+03] spdiff.cut.fctr(1,10]
## 9 8
## spdiff.cut.fctr(10,100] spdiff.cut.fctr(-100,-10]
## 7 6
##
## Node number 1: 531 observations, complexity param=0.02803738
## predicted class=N expected loss=0.2015066 P(node) =1
## class counts: 424 107
## probabilities: 0.798 0.202
## left son=2 (489 obs) right son=3 (42 obs)
## Primary splits:
## spdiff.cut.fctr(-10,-1] < 0.5 to the left, improve=8.127078, (0 missing)
## sprice.root2 < 15.49177 to the right, improve=7.398770, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the left, improve=2.488378, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the left, improve=2.142471, (0 missing)
## spdiff.cut.fctr(-100,-10] < 0.5 to the right, improve=1.046415, (0 missing)
##
## Node number 2: 489 observations, complexity param=0.007009346
## predicted class=N expected loss=0.1758691 P(node) =0.920904
## class counts: 403 86
## probabilities: 0.824 0.176
## left son=4 (279 obs) right son=5 (210 obs)
## Primary splits:
## sprice.root2 < 15.57156 to the right, improve=4.8624420, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the left, improve=3.5327020, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the left, improve=3.2831590, (0 missing)
## spdiff.cut.fctr(100,1e+03] < 0.5 to the right, improve=0.5612417, (0 missing)
## spdiff.cut.fctr(-100,-10] < 0.5 to the right, improve=0.1392625, (0 missing)
## Surrogate splits:
## spdiff.cut.fctr(10,100] < 0.5 to the left, agree=0.628, adj=0.133, (0 split)
## spdiff.cut.fctr(1,10] < 0.5 to the left, agree=0.626, adj=0.129, (0 split)
## spdiff.cut.fctr(100,1e+03] < 0.5 to the left, agree=0.616, adj=0.105, (0 split)
## spdiff.cut.fctr(-1,0] < 0.5 to the left, agree=0.581, adj=0.024, (0 split)
## spdiff.cut.fctr(0,1] < 0.5 to the left, agree=0.575, adj=0.010, (0 split)
##
## Node number 3: 42 observations, complexity param=0.02803738
## predicted class=N expected loss=0.5 P(node) =0.07909605
## class counts: 21 21
## probabilities: 0.500 0.500
## left son=6 (18 obs) right son=7 (24 obs)
## Primary splits:
## sprice.root2 < 14.22432 to the right, improve=1.75, (0 missing)
##
## Node number 4: 279 observations
## predicted class=N expected loss=0.1146953 P(node) =0.5254237
## class counts: 247 32
## probabilities: 0.885 0.115
##
## Node number 5: 210 observations, complexity param=0.007009346
## predicted class=N expected loss=0.2571429 P(node) =0.3954802
## class counts: 156 54
## probabilities: 0.743 0.257
## left son=10 (74 obs) right son=11 (136 obs)
## Primary splits:
## spdiff.cut.fctr(-100,-10] < 0.5 to the right, improve=2.0616400, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the left, improve=1.6787120, (0 missing)
## spdiff.cut.fctr(100,1e+03] < 0.5 to the right, improve=1.4538460, (0 missing)
## sprice.root2 < 10.23938 to the right, improve=1.3747700, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the left, improve=0.6548753, (0 missing)
## Surrogate splits:
## sprice.root2 < 14.98331 to the right, agree=0.671, adj=0.068, (0 split)
## spdiff.cut.fctr(10,100] < 0.5 to the left, agree=0.652, adj=0.014, (0 split)
##
## Node number 6: 18 observations
## predicted class=N expected loss=0.3333333 P(node) =0.03389831
## class counts: 12 6
## probabilities: 0.667 0.333
##
## Node number 7: 24 observations
## predicted class=Y expected loss=0.375 P(node) =0.04519774
## class counts: 9 15
## probabilities: 0.375 0.625
##
## Node number 10: 74 observations
## predicted class=N expected loss=0.1621622 P(node) =0.1393597
## class counts: 62 12
## probabilities: 0.838 0.162
##
## Node number 11: 136 observations, complexity param=0.007009346
## predicted class=N expected loss=0.3088235 P(node) =0.2561205
## class counts: 94 42
## probabilities: 0.691 0.309
## left son=22 (28 obs) right son=23 (108 obs)
## Primary splits:
## spdiff.cut.fctr(100,1e+03] < 0.5 to the right, improve=2.8683470, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the left, improve=0.7944005, (0 missing)
## sprice.root2 < 9.643093 to the right, improve=0.4963235, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the left, improve=0.0175102, (0 missing)
## Surrogate splits:
## sprice.root2 < 5.417281 to the left, agree=0.875, adj=0.393, (0 split)
##
## Node number 22: 28 observations
## predicted class=N expected loss=0.1071429 P(node) =0.0527307
## class counts: 25 3
## probabilities: 0.893 0.107
##
## Node number 23: 108 observations, complexity param=0.007009346
## predicted class=N expected loss=0.3611111 P(node) =0.2033898
## class counts: 69 39
## probabilities: 0.639 0.361
## left son=46 (101 obs) right son=47 (7 obs)
## Primary splits:
## sprice.root2 < 8.366301 to the right, improve=1.8672800, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the right, improve=0.5761905, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the left, improve=0.1853070, (0 missing)
##
## Node number 46: 101 observations, complexity param=0.007009346
## predicted class=N expected loss=0.3366337 P(node) =0.1902072
## class counts: 67 34
## probabilities: 0.663 0.337
## left son=92 (39 obs) right son=93 (62 obs)
## Primary splits:
## sprice.root2 < 13.60037 to the right, improve=0.378555200, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the right, improve=0.023664990, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the left, improve=0.005462615, (0 missing)
##
## Node number 47: 7 observations
## predicted class=Y expected loss=0.2857143 P(node) =0.01318267
## class counts: 2 5
## probabilities: 0.286 0.714
##
## Node number 92: 39 observations, complexity param=0.003115265
## predicted class=N expected loss=0.2820513 P(node) =0.07344633
## class counts: 28 11
## probabilities: 0.718 0.282
## left son=184 (8 obs) right son=185 (31 obs)
## Primary splits:
## sprice.root2 < 13.92834 to the left, improve=1.6013230000, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the left, improve=0.2005861000, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the left, improve=0.0002289377, (0 missing)
##
## Node number 93: 62 observations, complexity param=0.007009346
## predicted class=N expected loss=0.3709677 P(node) =0.1167608
## class counts: 39 23
## probabilities: 0.629 0.371
## left son=186 (54 obs) right son=187 (8 obs)
## Primary splits:
## sprice.root2 < 13.25271 to the left, improve=1.185484000, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the right, improve=0.243176200, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the right, improve=0.003665689, (0 missing)
##
## Node number 184: 8 observations
## predicted class=N expected loss=0 P(node) =0.01506591
## class counts: 8 0
## probabilities: 1.000 0.000
##
## Node number 185: 31 observations, complexity param=0.003115265
## predicted class=N expected loss=0.3548387 P(node) =0.05838041
## class counts: 20 11
## probabilities: 0.645 0.355
## left son=370 (11 obs) right son=371 (20 obs)
## Primary splits:
## spdiff.cut.fctr(10,100] < 0.5 to the left, improve=0.22991200, (0 missing)
## sprice.root2 < 14.98331 to the left, improve=0.09831029, (0 missing)
## Surrogate splits:
## spdiff.cut.fctr(1,10] < 0.5 to the right, agree=0.774, adj=0.364, (0 split)
##
## Node number 186: 54 observations, complexity param=0.007009346
## predicted class=N expected loss=0.3333333 P(node) =0.1016949
## class counts: 36 18
## probabilities: 0.667 0.333
## left son=372 (14 obs) right son=373 (40 obs)
## Primary splits:
## sprice.root2 < 12.32836 to the right, improve=0.5357143, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the right, improve=0.1500000, (0 missing)
##
## Node number 187: 8 observations
## predicted class=Y expected loss=0.375 P(node) =0.01506591
## class counts: 3 5
## probabilities: 0.375 0.625
##
## Node number 370: 11 observations
## predicted class=N expected loss=0.2727273 P(node) =0.02071563
## class counts: 8 3
## probabilities: 0.727 0.273
##
## Node number 371: 20 observations, complexity param=0.003115265
## predicted class=N expected loss=0.4 P(node) =0.03766478
## class counts: 12 8
## probabilities: 0.600 0.400
## left son=742 (13 obs) right son=743 (7 obs)
## Primary splits:
## sprice.root2 < 14.14196 to the right, improve=0.632967, (0 missing)
##
## Node number 372: 14 observations
## predicted class=N expected loss=0.2142857 P(node) =0.02636535
## class counts: 11 3
## probabilities: 0.786 0.214
##
## Node number 373: 40 observations, complexity param=0.007009346
## predicted class=N expected loss=0.375 P(node) =0.07532957
## class counts: 25 15
## probabilities: 0.625 0.375
## left son=746 (33 obs) right son=747 (7 obs)
## Primary splits:
## sprice.root2 < 12.16525 to the left, improve=1.9534630, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the right, improve=0.1535088, (0 missing)
##
## Node number 742: 13 observations
## predicted class=N expected loss=0.3076923 P(node) =0.02448211
## class counts: 9 4
## probabilities: 0.692 0.308
##
## Node number 743: 7 observations
## predicted class=Y expected loss=0.4285714 P(node) =0.01318267
## class counts: 3 4
## probabilities: 0.429 0.571
##
## Node number 746: 33 observations, complexity param=0.007009346
## predicted class=N expected loss=0.3030303 P(node) =0.06214689
## class counts: 23 10
## probabilities: 0.697 0.303
## left son=1492 (7 obs) right son=1493 (26 obs)
## Primary splits:
## sprice.root2 < 10.83799 to the right, improve=1.63170200, (0 missing)
## spdiff.cut.fctr(1,10] < 0.5 to the right, improve=0.70129870, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the left, improve=0.07272727, (0 missing)
##
## Node number 747: 7 observations
## predicted class=Y expected loss=0.2857143 P(node) =0.01318267
## class counts: 2 5
## probabilities: 0.286 0.714
##
## Node number 1492: 7 observations
## predicted class=N expected loss=0 P(node) =0.01318267
## class counts: 7 0
## probabilities: 1.000 0.000
##
## Node number 1493: 26 observations, complexity param=0.007009346
## predicted class=N expected loss=0.3846154 P(node) =0.04896422
## class counts: 16 10
## probabilities: 0.615 0.385
## left son=2986 (12 obs) right son=2987 (14 obs)
## Primary splits:
## spdiff.cut.fctr(1,10] < 0.5 to the right, improve=2.1172160, (0 missing)
## spdiff.cut.fctr(10,100] < 0.5 to the left, improve=0.9864802, (0 missing)
## sprice.root2 < 9.219273 to the left, improve=0.4188034, (0 missing)
## Surrogate splits:
## spdiff.cut.fctr(10,100] < 0.5 to the left, agree=0.885, adj=0.750, (0 split)
## sprice.root2 < 9.848334 to the right, agree=0.692, adj=0.333, (0 split)
##
## Node number 2986: 12 observations
## predicted class=N expected loss=0.1666667 P(node) =0.02259887
## class counts: 10 2
## probabilities: 0.833 0.167
##
## Node number 2987: 14 observations
## predicted class=Y expected loss=0.4285714 P(node) =0.02636535
## class counts: 6 8
## probabilities: 0.429 0.571
##
## n= 531
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 531 107 N (0.7984934 0.2015066)
## 2) spdiff.cut.fctr(-10,-1]< 0.5 489 86 N (0.8241309 0.1758691)
## 4) sprice.root2>=15.57156 279 32 N (0.8853047 0.1146953) *
## 5) sprice.root2< 15.57156 210 54 N (0.7428571 0.2571429)
## 10) spdiff.cut.fctr(-100,-10]>=0.5 74 12 N (0.8378378 0.1621622) *
## 11) spdiff.cut.fctr(-100,-10]< 0.5 136 42 N (0.6911765 0.3088235)
## 22) spdiff.cut.fctr(100,1e+03]>=0.5 28 3 N (0.8928571 0.1071429) *
## 23) spdiff.cut.fctr(100,1e+03]< 0.5 108 39 N (0.6388889 0.3611111)
## 46) sprice.root2>=8.366301 101 34 N (0.6633663 0.3366337)
## 92) sprice.root2>=13.60037 39 11 N (0.7179487 0.2820513)
## 184) sprice.root2< 13.92834 8 0 N (1.0000000 0.0000000) *
## 185) sprice.root2>=13.92834 31 11 N (0.6451613 0.3548387)
## 370) spdiff.cut.fctr(10,100]< 0.5 11 3 N (0.7272727 0.2727273) *
## 371) spdiff.cut.fctr(10,100]>=0.5 20 8 N (0.6000000 0.4000000)
## 742) sprice.root2>=14.14196 13 4 N (0.6923077 0.3076923) *
## 743) sprice.root2< 14.14196 7 3 Y (0.4285714 0.5714286) *
## 93) sprice.root2< 13.60037 62 23 N (0.6290323 0.3709677)
## 186) sprice.root2< 13.25271 54 18 N (0.6666667 0.3333333)
## 372) sprice.root2>=12.32836 14 3 N (0.7857143 0.2142857) *
## 373) sprice.root2< 12.32836 40 15 N (0.6250000 0.3750000)
## 746) sprice.root2< 12.16525 33 10 N (0.6969697 0.3030303)
## 1492) sprice.root2>=10.83799 7 0 N (1.0000000 0.0000000) *
## 1493) sprice.root2< 10.83799 26 10 N (0.6153846 0.3846154)
## 2986) spdiff.cut.fctr(1,10]>=0.5 12 2 N (0.8333333 0.1666667) *
## 2987) spdiff.cut.fctr(1,10]< 0.5 14 6 Y (0.4285714 0.5714286) *
## 747) sprice.root2>=12.16525 7 2 Y (0.2857143 0.7142857) *
## 187) sprice.root2>=13.25271 8 3 Y (0.3750000 0.6250000) *
## 47) sprice.root2< 8.366301 7 2 Y (0.2857143 0.7142857) *
## 3) spdiff.cut.fctr(-10,-1]>=0.5 42 21 N (0.5000000 0.5000000)
## 6) sprice.root2>=14.22432 18 6 N (0.6666667 0.3333333) *
## 7) sprice.root2< 14.22432 24 9 Y (0.3750000 0.6250000) *
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.cp.0.rpart.N
## 1 N 378
## 2 Y 55
## sold.fctr.predict.Max.cor.Y.rcv.1X1.cp.0.rpart.Y
## 1 46
## 2 52
## Prediction
## Reference N Y
## N 378 46
## Y 55 52
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.8097928 0.3897455 0.7737799 0.8423139 0.7984934
## AccuracyPValue McnemarPValue
## 0.2782704 0.4260147
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.cp.0.rpart.N
## 1 N 295
## 2 Y 57
## sold.fctr.predict.Max.cor.Y.rcv.1X1.cp.0.rpart.Y
## 1 79
## 2 53
## Prediction
## Reference N Y
## N 295 79
## Y 57 53
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.71900826 0.25274725 0.67667473 0.75864224 0.77272727
## AccuracyPValue McnemarPValue
## 0.99752896 0.07174464
## id feats
## 1 Max.cor.Y.rcv.1X1.cp.0.rpart spdiff.cut.fctr,sprice.root2
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 0 0.693 0.015
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.6667806 0.9410377 0.3925234 0.7336449
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.5073171 0.8097928
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7737799 0.8423139 0.3897455
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5914439 0.9010695 0.2818182 0.628087
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4380165 0.7190083
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6766747 0.7586422 0.2527473
#stop(here"); glb_to_sav(); all.equal(glb_models_df, sav_models_df)
# if (glb_is_regression || glb_is_binomial) # For multinomials this model will be run next by default
# Fit the "Max.cor.Y" rpart model on the max_cor_y_x_vars features using repeated
# cross-validation (glb_rcv_n_folds folds x glb_rcv_n_repeats repeats).
# Model selection uses the project-wide summary function / metric / direction
# (glbMdlMetricSummaryFn, glbMdlMetricSummary, glbMdlMetricMaximize); class
# probabilities are requested only when glb_is_classification is TRUE.
# No tune.df is supplied here, unlike the cp-pinned fit earlier in this section.
# NOTE(review): presumably myfit_mdl then falls back to a default tuning grid -- confirm.
ret_lst <- myfit_mdl(mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
id.prefix="Max.cor.Y",
type=glb_model_type, trainControl.method="repeatedcv",
trainControl.number=glb_rcv_n_folds, trainControl.repeats=glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method="rpart")),
indep_vars=max_cor_y_x_vars, rsp_var=glb_rsp_var,
fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
## [1] "fitting model: Max.cor.Y.rpart"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## + Fold1.Rep1: cp=0
## - Fold1.Rep1: cp=0
## + Fold2.Rep1: cp=0
## - Fold2.Rep1: cp=0
## + Fold3.Rep1: cp=0
## - Fold3.Rep1: cp=0
## + Fold1.Rep2: cp=0
## - Fold1.Rep2: cp=0
## + Fold2.Rep2: cp=0
## - Fold2.Rep2: cp=0
## + Fold3.Rep2: cp=0
## - Fold3.Rep2: cp=0
## + Fold1.Rep3: cp=0
## - Fold1.Rep3: cp=0
## + Fold2.Rep3: cp=0
## - Fold2.Rep3: cp=0
## + Fold3.Rep3: cp=0
## - Fold3.Rep3: cp=0
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.028 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Max.cor.Y", : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 531
##
## CP nsplit rel error
## 1 0.02803738 0 1
##
## Node number 1: 531 observations
## predicted class=N expected loss=0.2015066 P(node) =1
## class counts: 424 107
## probabilities: 0.798 0.202
##
## n= 531
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 531 107 N (0.7984934 0.2015066) *
## sold.fctr sold.fctr.predict.Max.cor.Y.rpart.Y
## 1 N 424
## 2 Y 107
## Prediction
## Reference N Y
## N 0 424
## Y 0 107
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.015066e-01 0.000000e+00 1.681880e-01 2.381872e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 8.945646e-94
## sold.fctr sold.fctr.predict.Max.cor.Y.rpart.Y
## 1 N 374
## 2 Y 110
## Prediction
## Reference N Y
## N 0 374
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.272727e-01 0.000000e+00 1.906680e-01 2.672469e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 6.857299e-83
## id feats max.nTuningRuns
## 1 Max.cor.Y.rpart spdiff.cut.fctr,sprice.root2 5
## min.elapsedtime.everything min.elapsedtime.final max.AUCpROC.fit
## 1 1.526 0.013 0.5
## max.Sens.fit max.Spec.fit max.AUCROCR.fit opt.prob.threshold.fit
## 1 1 0 0.5 0.2
## max.f.score.fit max.Accuracy.fit max.AccuracyLower.fit
## 1 0.3354232 0.7897092 0.168188
## max.AccuracyUpper.fit max.Kappa.fit max.AUCpROC.OOB max.Sens.OOB
## 1 0.2381872 0.06035055 0.5 1
## max.Spec.OOB max.AUCROCR.OOB opt.prob.threshold.OOB max.f.score.OOB
## 1 0 0.5 0.2 0.3703704
## max.Accuracy.OOB max.AccuracyLower.OOB max.AccuracyUpper.OOB
## 1 0.2272727 0.190668 0.2672469
## max.Kappa.OOB max.AccuracySD.fit max.KappaSD.fit
## 1 0 0.01480344 0.06208502
# Optional time-of-day polynomial model.
# Runs only when date variables are configured (glb_date_vars is non-NULL) and
# glb_allobs_df has at least one column matching "<date var>.day.minutes.poly.".
#
# The per-variable patterns are collapsed into ONE alternation regex (collapse = "|").
# grepl()/grep() use only the first element of a multi-element `pattern`, so without
# the collapse every date variable after the first was ignored, both in the guard
# and in the feature list. With a single date variable the pattern is unchanged.
#
# NOTE(review): this call passes mdl_id= / model_method= / indep_vars_vctr= /
# n_cv_folds= ..., whereas the other myfit_mdl() calls in this section pass
# mdl_specs_lst= / indep_vars=. Confirm these argument names are still accepted by
# myfit_mdl before enabling date variables.
if (!is.null(glb_date_vars) &&
    (sum(grepl(paste0(glb_date_vars, "\\.day\\.minutes\\.poly\\.", collapse = "|"),
               names(glb_allobs_df))) > 0)) {
#     ret_lst <- myfit_mdl(mdl_id="Max.cor.Y.TmSrs.poly1",
#                          model_method=ifelse(glb_is_regression, "lm",
#                                          ifelse(glb_is_binomial, "glm", "rpart")),
#                          model_type=glb_model_type,
#                          indep_vars_vctr=c(max_cor_y_x_vars, paste0(glb_date_vars, ".day.minutes")),
#                          rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
#                          fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
#                          n_cv_folds=glb_rcv_n_folds, tune_models_df=NULL)
#
    # Method by problem type: lm for regression, glm for binomial, rpart otherwise.
    ret_lst <- myfit_mdl(mdl_id="Max.cor.Y.TmSrs.poly",
                         model_method=ifelse(glb_is_regression, "lm",
                                         ifelse(glb_is_binomial, "glm", "rpart")),
                         model_type=glb_model_type,
                         # max-correlation features plus every matching polynomial column
                         indep_vars_vctr=c(max_cor_y_x_vars,
                             grep(paste0(glb_date_vars, "\\.day\\.minutes\\.poly\\.",
                                         collapse = "|"),
                                  names(glb_allobs_df), value=TRUE)),
                         rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
                         fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
                         n_cv_folds=glb_rcv_n_folds, tune_models_df=NULL)
}
# Interact.High.cor.Y: glmnet on the max-correlation features plus interactions
# between the first of them and each "high-correlation" feature.
#
# int_feats = the distinct non-NA values of glb_feats_df$cor.high.X, minus the ids
# of features flagged nzv. The model is fitted only when at least one remains.
int_feats <- setdiff(setdiff(unique(glb_feats_df$cor.high.X), NA),
                     subset(glb_feats_df, nzv)$id)
if (length(int_feats) > 0) {
    ret_lst <- myfit_mdl(
        mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
            id.prefix = "Interact.High.cor.Y",
            type = glb_model_type, trainControl.method = "repeatedcv",
            trainControl.number = glb_rcv_n_folds,
            trainControl.repeats = glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method = "glmnet")),
        # main effects, then "<first max-cor feature>:<high-cor feature>" terms
        indep_vars = c(max_cor_y_x_vars,
                       paste(max_cor_y_x_vars[1], int_feats, sep = ":")),
        rsp_var = glb_rsp_var,
        fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
}
## [1] "fitting model: Interact.High.cor.Y.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2,spdiff.cut.fctr:D.terms.post.stop.n.log,spdiff.cut.fctr:D.weight.post.stem.sum,spdiff.cut.fctr:D.chrs.n.log,spdiff.cut.fctr:D.terms.post.stem.n.log,spdiff.cut.fctr:startprice.dcm1.is9,spdiff.cut.fctr:sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Interact.High.cor.Y", : model's bestTune found at an
## extreme of tuneGrid for parameter: lambda
## Length Class Mode
## a0 100 -none- numeric
## beta 5500 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 55 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -1.001998231
## spdiff.cut.fctr(-10,-1]
## 1.008973144
## spdiff.cut.fctr(-1,0]
## 1.656927178
## spdiff.cut.fctr(1,10]
## 0.384536949
## spdiff.cut.fctr(10,100]
## 0.071932599
## sprice.root2
## -0.034891514
## spdiff.cut.fctr(-1e+03,-100]:D.chrs.n.log
## -0.032476910
## spdiff.cut.fctr(-1e+03,-100]:D.weight.post.stem.sum
## -0.058337094
## spdiff.cut.fctr(-1,0]:sprice.root2
## 0.036870844
## spdiff.cut.fctr(10,100]:sprice.root2
## 0.015874997
## spdiff.cut.fctr(100,1e+03]:sprice.root2
## -0.008707444
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -0.98777320
## spdiff.cut.fctr(-10,-1]
## 1.06410407
## spdiff.cut.fctr(-1,0]
## 1.75161339
## spdiff.cut.fctr(1,10]
## 0.44646355
## spdiff.cut.fctr(10,100]
## 0.07872601
## sprice.root2
## -0.03691858
## spdiff.cut.fctr(-1e+03,-100]:D.chrs.n.log
## -0.03755416
## spdiff.cut.fctr(-1e+03,-100]:D.weight.post.stem.sum
## -0.06639158
## spdiff.cut.fctr(-1,0]:sprice.root2
## 0.04126771
## spdiff.cut.fctr(10,100]:sprice.root2
## 0.01871424
## spdiff.cut.fctr(100,1e+03]:sprice.root2
## -0.01485420
## sold.fctr sold.fctr.predict.Interact.High.cor.Y.glmnet.N
## 1 N 291
## 2 Y 31
## sold.fctr.predict.Interact.High.cor.Y.glmnet.Y
## 1 133
## 2 76
## Prediction
## Reference N Y
## N 291 133
## Y 31 76
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.911488e-01 2.924027e-01 6.499247e-01 7.302296e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 3.100958e-15
## sold.fctr sold.fctr.predict.Interact.High.cor.Y.glmnet.N
## 1 N 235
## 2 Y 30
## sold.fctr.predict.Interact.High.cor.Y.glmnet.Y
## 1 139
## 2 80
## Prediction
## Reference N Y
## N 235 139
## Y 30 80
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.508264e-01 2.634707e-01 6.065024e-01 6.932948e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 9.758116e-17
## id
## 1 Interact.High.cor.Y.glmnet
## feats
## 1 spdiff.cut.fctr,sprice.root2,spdiff.cut.fctr:D.terms.post.stop.n.log,spdiff.cut.fctr:D.weight.post.stem.sum,spdiff.cut.fctr:D.chrs.n.log,spdiff.cut.fctr:D.terms.post.stem.n.log,spdiff.cut.fctr:startprice.dcm1.is9,spdiff.cut.fctr:sprice.root2
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 11.792 1.396
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5233645 1 0.04672897 0.7465945
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4810127 0.8098079
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6499247 0.7302296 0.08548063
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5187166 0.9919786 0.04545455 0.7259844
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4863222 0.6508264
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6065024 0.6932948 0.2634707
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008044233 0.05798146
# Low.cor.X
# if (glb_is_classification && glb_is_binomial)
# indep_vars_vctr <- subset(glb_feats_df, is.na(cor.high.X) &
# is.ConditionalX.y &
# (exclude.as.feat != 1))[, "id"] else
# Low.cor.X feature set: ids of features that have no cor.high.X entry, are not
# flagged nzv, and are not excluded (exclude.as.feat != 1). Rows where the
# condition evaluates to NA are dropped (which() ignores them). The ids are then
# passed through myadjust_interaction_feats().
indep_vars <- myadjust_interaction_feats(
    glb_feats_df[with(glb_feats_df,
                      which(is.na(cor.high.X) & !nzv & (exclude.as.feat != 1))),
                 ][, "id"])

# glmnet with repeated cross-validation on the Low.cor.X feature set.
ret_lst <- myfit_mdl(
    mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
        id.prefix = "Low.cor.X",
        type = glb_model_type, trainControl.method = "repeatedcv",
        trainControl.number = glb_rcv_n_folds,
        trainControl.repeats = glb_rcv_n_repeats,
        trainControl.classProbs = glb_is_classification,
        trainControl.summaryFunction = glbMdlMetricSummaryFn,
        train.metric = glbMdlMetricSummary,
        train.maximize = glbMdlMetricMaximize,
        train.method = "glmnet")),
    indep_vars = indep_vars, rsp_var = glb_rsp_var,
    fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
## [1] "fitting model: Low.cor.X.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.d20nexp,storage.fctr,D.ratio.weight.sum.wrds.n,.rnorm,startprice.dgt2.is9,D.wrds.stop.n.log,D.weight.sum.stem.stop.Ratio,cellular.fctr,startprice.dgt1.is9,prdl.descr.my.fctr,D.terms.post.stop.n.log,D.chrs.pnct11.n.log,color.fctr,D.chrs.pnct13.n.log,startprice.dcm1.is9,condition.fctr,sprice.root2,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = "Low.cor.X", : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 100 -none- numeric
## beta 12800 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 128 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -0.99818039
## D.chrs.pnct11.n.log
## -0.01267135
## D.terms.post.stop.n.log
## -0.02545365
## prdl.descr.my.fctriPad4#0
## 0.05358214
## prdl.descr.my.fctriPadAir#1
## -0.13665317
## spdiff.cut.fctr(-10,-1]
## 1.04927486
## spdiff.cut.fctr(-1,0]
## 2.07628953
## spdiff.cut.fctr(1,10]
## 0.42865652
## spdiff.cut.fctr(10,100]
## 0.33336829
## sprice.root2
## -0.03698547
## cellular.fctr1:carrier.fctrUnknown
## -0.08089084
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## 1.71471655
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## 0.80687870
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -0.985103766
## D.chrs.pnct11.n.log
## -0.048861736
## D.terms.post.stop.n.log
## -0.037606690
## prdl.descr.my.fctriPad4#0
## 0.126911687
## prdl.descr.my.fctriPadAir#1
## -0.184351952
## spdiff.cut.fctr(-10,-1]
## 1.111275547
## spdiff.cut.fctr(-1,0]
## 2.230822326
## spdiff.cut.fctr(1,10]
## 0.496301462
## spdiff.cut.fctr(10,100]
## 0.382872499
## spdiff.cut.fctr(100,1e+03]
## -0.009470411
## sprice.root2
## -0.038390135
## cellular.fctr1:carrier.fctrUnknown
## -0.150443891
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## 1.943821473
## prdl.descr.my.fctriPad1#1:.clusterid.fctr3
## 0.017204070
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## 1.002253926
## sold.fctr sold.fctr.predict.Low.cor.X.glmnet.N
## 1 N 298
## 2 Y 36
## sold.fctr.predict.Low.cor.X.glmnet.Y
## 1 126
## 2 71
## Prediction
## Reference N Y
## N 298 126
## Y 36 71
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.949153e-01 2.787383e-01 6.538034e-01 7.338414e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 2.700298e-12
## sold.fctr sold.fctr.predict.Low.cor.X.glmnet.N
## 1 N 249
## 2 Y 36
## sold.fctr.predict.Low.cor.X.glmnet.Y
## 1 125
## 2 74
## Prediction
## Reference N Y
## N 249 125
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.673554e-01 2.633111e-01 6.234210e-01 7.092300e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 4.051548e-12
## id
## 1 Low.cor.X.glmnet
## feats
## 1 spdiff.cut.fctr,sprice.d20nexp,storage.fctr,D.ratio.weight.sum.wrds.n,.rnorm,startprice.dgt2.is9,D.wrds.stop.n.log,D.weight.sum.stem.stop.Ratio,cellular.fctr,startprice.dgt1.is9,prdl.descr.my.fctr,D.terms.post.stop.n.log,D.chrs.pnct11.n.log,color.fctr,D.chrs.pnct13.n.log,startprice.dcm1.is9,condition.fctr,sprice.root2,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 9.019 0.992
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5373832 1 0.07476636 0.7637762
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4671053 0.8048071
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6538034 0.7338414 0.0825303
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5406417 0.9812834 0.1 0.710756
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4789644 0.6673554
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.623421 0.70923 0.2633111
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01292008 0.05420123
# Discard the last myfit_mdl() return value; it is not read again after this point.
rm(ret_lst)
# Record a new minor step under the "fit.models" label in the chunk-timing table
# (major.inc=FALSE keeps the same major step).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 10 fit.models 6 0 0 221.813 298.939 77.126
## 11 fit.models 6 1 1 298.940 NA NA
# Start a fresh chunk-timing table for this stage (NULL = no existing table),
# with "fit.models_1_bgn" / "setup" as its first entry.
fit.models_1_chunk_df <- myadd_chunk(NULL, "fit.models_1_bgn", label.minor="setup")
## label step_major step_minor label_minor bgn end elapsed
## 1 fit.models_1_bgn 1 0 setup 309.17 NA NA
#stop(here"); glb_to_sav(); all.equal(glb_models_df, sav_models_df)
# Fit every model family named in glb_mdl_family_lst with each of its methods.
#
# For each family prefix (mdl_id_pfx):
#   1. Open a "setup" entry in fit.models_1_chunk_df.
#   2. Choose the feature set (indep_vars):
#        - "*.Interact" prefixes, only while topindep_var and interact_vars are both
#          still NULL: take the best glmnet model so far, rename the prefix to
#          "<best model id minus method>.Interact", pick the top-ranked feature that
#          is not interaction-only (topindep_var), and cross it ("*") with the other
#          sufficiently important features (interact_vars).
#        - otherwise, in order: glb_mdl_feats_lst[[mdl_id_pfx]]; the RFE predictors
#          for "RFE." prefixes; all non-nzv, non-excluded features.
#   3. Fit one model per method through myfit_mdl() with repeated CV.
#
# Changes from the previous version:
#   * `.rnorm` is removed for rpart / rf on a per-method copy (mdl_indep_vars).
#     Previously the removal mutated indep_vars, so every method fitted after
#     rpart / rf in the same family also lost `.rnorm`, depending on method order.
#   * If every ranked feature is interaction-only, topindep_var stays NULL; the
#     family is now skipped with a warning instead of passing NULL to grepl() /
#     paste() further down.
#   * The ".fctr" split uses fixed = TRUE, matching the adjacent grepl() call, so
#     "." is no longer treated as a regex wildcard.
topindep_var <- NULL; interact_vars <- NULL;
for (mdl_id_pfx in names(glb_mdl_family_lst)) {
    fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
        paste0("fit.models_1_", mdl_id_pfx), major.inc = TRUE, label.minor = "setup")

    indep_vars <- NULL;

    if (grepl("\\.Interact", mdl_id_pfx)) {
        if (is.null(topindep_var) && is.null(interact_vars)) {
            # select best glmnet model upto now
            dsp_models_df <- orderBy(model_sel_frmla <- get_model_sel_frmla(),
                                     glb_models_df)
            dsp_models_df <- subset(dsp_models_df, grepl(".glmnet", id, fixed=TRUE))
            bst_mdl_id <- dsp_models_df$id[1]
            # e.g. "<prefix>.glmnet" -> "<prefix>.Interact"
            mdl_id_pfx <-
                paste(c(head(unlist(strsplit(bst_mdl_id, "[.]")), -1), "Interact"),
                      collapse=".")
            # select most importance feature
            if (is.null(bst_featsimp_df <-
                        myget_feats_importance(glb_models_lst[[bst_mdl_id]]))) {
                warning("Base model for RFE.Interact: ", bst_mdl_id,
                        " has no important features")
                next
            }

            # Walk the ranked rows until one is not an interaction-only feature.
            topindep_ix <- 1
            while (is.null(topindep_var) && (topindep_ix <= nrow(bst_featsimp_df))) {
                topindep_var <- row.names(bst_featsimp_df)[topindep_ix]
                # Factor dummy row names carry the level after ".fctr"; reduce them
                # to the factor feature name itself.
                if (grepl(".fctr", topindep_var, fixed=TRUE))
                    topindep_var <-
                        paste0(unlist(strsplit(topindep_var, ".fctr", fixed=TRUE))[1],
                               ".fctr")
                if (topindep_var %in% names(glb_interaction_only_feats_lst)) {
                    topindep_var <- NULL; topindep_ix <- topindep_ix + 1
                } else break
            }
            if (is.null(topindep_var)) {
                # Every ranked feature was interaction-only: nothing to interact with.
                warning("Base model for ", mdl_id_pfx, ": ", bst_mdl_id,
                        " has no usable top feature (all are interaction-only)")
                next
            }

            # select features with importance > max(10, importance of .rnorm) & is not highest
            # combine factor dummy features to just the factor feature
            if (length(pos_rnorm <-
                       grep(".rnorm", row.names(bst_featsimp_df), fixed=TRUE)) > 0)
                imp_rnorm <- bst_featsimp_df[pos_rnorm, 1] else
                imp_rnorm <- NA
            importance_cutoff <- max(10, imp_rnorm, na.rm=TRUE)
            interact_vars <-
                tail(row.names(subset(bst_featsimp_df,
                                      importance > importance_cutoff)), -1)
            if (length(interact_vars) > 0) {
                interact_vars <-
                    myadjust_interaction_feats(myextract_actual_feats(interact_vars))
                # never interact the top feature with itself
                interact_vars <-
                    interact_vars[!grepl(topindep_var, interact_vars, fixed=TRUE)]
            }
            ### bid0_sp only
#             interact_vars <- c(
#     "biddable", "D.ratio.sum.TfIdf.wrds.n", "D.TfIdf.sum.stem.stop.Ratio", "D.sum.TfIdf",
#     "D.TfIdf.sum.post.stop", "D.TfIdf.sum.post.stem", "D.ratio.wrds.stop.n.wrds.n", "D.chrs.uppr.n.log",
#     "D.chrs.n.log", "color.fctr"
#     # , "condition.fctr", "prdl.my.descr.fctr"
#                                 )
#            interact_vars <- setdiff(interact_vars, c("startprice.dgt2.is9", "color.fctr"))
            ###
            indep_vars <- myextract_actual_feats(row.names(bst_featsimp_df))
            indep_vars <- setdiff(indep_vars, topindep_var)
            if (length(interact_vars) > 0) {
                # "a*b" terms replace the separate entries for the interacted features
                indep_vars <-
                    setdiff(indep_vars, myextract_actual_feats(interact_vars))
                indep_vars <- c(indep_vars,
                    paste(topindep_var, setdiff(interact_vars, topindep_var),
                          sep = "*"))
            } else indep_vars <- union(indep_vars, topindep_var)
        }
    }

    # Fallback chain for families that did not get an interaction feature set above.
    if (is.null(indep_vars))
        indep_vars <- glb_mdl_feats_lst[[mdl_id_pfx]]

    if (is.null(indep_vars) && grepl("RFE\\.", mdl_id_pfx))
        indep_vars <- myextract_actual_feats(predictors(rfe_fit_results))

    if (is.null(indep_vars))
        indep_vars <- subset(glb_feats_df, !nzv & (exclude.as.feat != 1))[, "id"]

    indep_vars <- myadjust_interaction_feats(indep_vars)

    if (grepl("\\.Interact", mdl_id_pfx)) {
        # if (method != tail(unlist(strsplit(bst_mdl_id, "[.]")), 1)) next
        # A renamed "<best>.Interact" prefix has no entry of its own; reuse the
        # methods registered under "Best.Interact" when that entry exists.
        if (is.null(glb_mdl_family_lst[[mdl_id_pfx]])) {
            if (!is.null(glb_mdl_family_lst[["Best.Interact"]]))
                glb_mdl_family_lst[[mdl_id_pfx]] <-
                    glb_mdl_family_lst[["Best.Interact"]]
        }
    }

    if (is.null(glb_mdl_family_lst[[mdl_id_pfx]]))
        mdl_methods <- glbMdlMethods else
        mdl_methods <- glb_mdl_family_lst[[mdl_id_pfx]]

    for (method in mdl_methods) {
        # Work on a per-method copy so the rpart / rf adjustment below cannot leak
        # into the methods fitted after it.
        mdl_indep_vars <- indep_vars
        if (method %in% c("rpart", "rf")) {
            # rpart:  fubar's the tree
            # rf:     skip the scenario w/ .rnorm for speed
            mdl_indep_vars <- setdiff(mdl_indep_vars, c(".rnorm"))
            #mdl_id <- paste0(mdl_id_pfx, ".no.rnorm")
        }

        fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df,
            paste0("fit.models_1_", mdl_id_pfx), major.inc = FALSE,
            label.minor = method)

        ret_lst <-
            myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                id.prefix = mdl_id_pfx,
                type = glb_model_type, tune.df = glb_tune_models_df,
                trainControl.method = "repeatedcv",
                trainControl.number = glb_rcv_n_folds,
                trainControl.repeats = glb_rcv_n_repeats,
                trainControl.classProbs = glb_is_classification,
                trainControl.summaryFunction = glbMdlMetricSummaryFn,
                #trainControl.allowParallel = FALSE,
                train.metric = glbMdlMetricSummary,
                train.maximize = glbMdlMetricMaximize,
                train.method = method)),
                indep_vars = mdl_indep_vars, rsp_var = glb_rsp_var,
                fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
    }
}
## label step_major step_minor label_minor bgn end
## 1 fit.models_1_bgn 1 0 setup 309.17 309.18
## 2 fit.models_1_RFE.X 2 0 setup 309.18 NA
## elapsed
## 1 0.01
## 2 NA
## label step_major step_minor label_minor bgn end
## 2 fit.models_1_RFE.X 2 0 setup 309.180 309.188
## 3 fit.models_1_RFE.X 2 1 glm 309.189 NA
## elapsed
## 2 0.008
## 3 NA
## [1] "fitting model: RFE.X.glm"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## + Fold1.Rep1: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep1: parameter=none
## + Fold2.Rep1: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep1: parameter=none
## + Fold3.Rep1: parameter=none
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep1: parameter=none
## + Fold1.Rep2: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep2: parameter=none
## + Fold2.Rep2: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep2: parameter=none
## + Fold3.Rep2: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep2: parameter=none
## + Fold1.Rep3: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep3: parameter=none
## + Fold2.Rep3: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep3: parameter=none
## + Fold3.Rep3: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep3: parameter=none
## Aggregating results
## Fitting final model on full training set
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1258 -0.5147 -0.0075 0.0000 2.5354
##
## Coefficients: (54 not defined because of singularities)
## Estimate Std. Error
## (Intercept) -35.5251 2347.4261
## D.chrs.n.log -30.4194 22.2223
## D.chrs.pnct11.n.log -1.7478 1.2744
## D.chrs.uppr.n.log 20.3009 18.1866
## D.ratio.weight.sum.wrds.n -6.2341 3.4056
## D.ratio.wrds.stop.n.wrds.n -1.7114 11.5441
## D.terms.post.stem.n.log -18.8162 22.3418
## D.terms.post.stop.n.log 8.5703 21.8389
## D.weight.post.stem.sum -0.5341 1.9540
## D.weight.post.stop.sum 0.7282 1.9242
## D.weight.sum NA NA
## D.wrds.n.log 11.3755 7.9906
## D.wrds.stop.n.log -0.4428 1.9674
## D.wrds.unq.n.log NA NA
## cellular.fctr1 0.9251 0.5496
## cellular.fctrUnknown 1.8047 0.9109
## color.fctrGold -0.2849 1.0652
## `color.fctrSpace Gray` 0.4894 0.6425
## color.fctrUnknown -0.1894 0.4700
## color.fctrWhite 0.1585 0.5540
## `condition.fctrFor parts or not working` -2.1007 0.9424
## `condition.fctrManufacturer refurbished` 0.1929 1.1508
## condition.fctrNew 0.2566 0.5907
## `condition.fctrNew other (see details)` 1.5485 0.9232
## `condition.fctrSeller refurbished` -1.3625 0.9607
## `prdl.descr.my.fctrUnknown#1` 41.7160 18.3730
## `prdl.descr.my.fctriPad1#0` -2.3003 1.1761
## `prdl.descr.my.fctriPad1#1` 40.2312 18.2017
## `prdl.descr.my.fctriPad2#0` -1.1311 1.1258
## `prdl.descr.my.fctriPad2#1` 41.1486 18.2673
## `prdl.descr.my.fctriPad3#0` -21.7535 12972.4937
## `prdl.descr.my.fctriPad3#1` 42.4556 18.5258
## `prdl.descr.my.fctriPad4#0` 0.2395 1.1492
## `prdl.descr.my.fctriPad4#1` 45.3112 18.6117
## `prdl.descr.my.fctriPadAir#0` 0.7392 1.0230
## `prdl.descr.my.fctriPadAir#1` 19.5044 4529.3651
## `prdl.descr.my.fctriPadAir2#0` 1.6547 1.1507
## `prdl.descr.my.fctriPadAir2#1` 60.7087 8118.7316
## `prdl.descr.my.fctriPadmini#0` -1.0812 0.9337
## `prdl.descr.my.fctriPadmini#1` 36.6547 17.3271
## `prdl.descr.my.fctriPadmini2#0` -0.2596 1.0295
## `prdl.descr.my.fctriPadmini2#1` 23.1572 9741.3543
## `prdl.descr.my.fctriPadmini3#0` 0.1433 1.2109
## `prdl.descr.my.fctriPadmini3#1` NA NA
## `spdiff.cut.fctr(-100,-10]` 35.6537 2347.3752
## `spdiff.cut.fctr(-10,-1]` 37.6737 2347.3753
## `spdiff.cut.fctr(-1,0]` 70.3553 7711.8797
## `spdiff.cut.fctr(0,1]` 36.0608 2347.3757
## `spdiff.cut.fctr(1,10]` 37.2415 2347.3753
## `spdiff.cut.fctr(10,100]` 36.3722 2347.3752
## `spdiff.cut.fctr(100,1e+03]` 30.5340 2347.3761
## sprice.d20nexp 6.4204 8.7758
## sprice.log10 2.1152 3.2146
## sprice.root2 -0.6053 0.4445
## startprice.dcm1.is9 -0.5768 0.3596
## startprice.dgt2.is9 0.7341 0.4707
## storage.fctr16 -2.0750 1.2669
## storage.fctr32 -1.4437 1.2923
## storage.fctr64 -1.0443 1.2301
## storage.fctrUnknown -2.4722 1.5219
## `cellular.fctr0:carrier.fctrNone` NA NA
## `cellular.fctr1:carrier.fctrNone` NA NA
## `cellular.fctrUnknown:carrier.fctrNone` NA NA
## `cellular.fctr0:carrier.fctrSprint` NA NA
## `cellular.fctr1:carrier.fctrSprint` 0.7685 1.2184
## `cellular.fctrUnknown:carrier.fctrSprint` NA NA
## `cellular.fctr0:carrier.fctrT-Mobile` NA NA
## `cellular.fctr1:carrier.fctrT-Mobile` 0.9858 1.2868
## `cellular.fctrUnknown:carrier.fctrT-Mobile` NA NA
## `cellular.fctr0:carrier.fctrUnknown` NA NA
## `cellular.fctr1:carrier.fctrUnknown` -2.4972 1.1422
## `cellular.fctrUnknown:carrier.fctrUnknown` NA NA
## `cellular.fctr0:carrier.fctrVerizon` NA NA
## `cellular.fctr1:carrier.fctrVerizon` 0.2172 0.7526
## `cellular.fctrUnknown:carrier.fctrVerizon` NA NA
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr2` -21.8629 6626.3538
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr2` 2.2920 1.9076
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr2` -7.3332 3.4968
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr2` 0.5491 1.6424
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr2` -4.4162 3.6158
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr2` -4.8142 8425.2137
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2` 3.0549 7945.8675
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr2` 8.4380 4.0335
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2` 18.1688 9741.3372
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr3` 62.3438 18431.8246
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr3` 2.2981 1.4975
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr3` -18.6935 10706.4270
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr3` 1.4952 1.6821
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr3` -16.9222 13815.8778
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr3` 8.2765 11075.5533
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr3` 30.3781 19026.8616
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3` 25.1032 9741.3370
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr4` 0.3716 5.1331
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr4` 41.9998 19591.0749
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr4` -14.7390 13022.4113
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4` NA NA
## z value Pr(>|z|)
## (Intercept) -0.015 0.9879
## D.chrs.n.log -1.369 0.1710
## D.chrs.pnct11.n.log -1.371 0.1702
## D.chrs.uppr.n.log 1.116 0.2643
## D.ratio.weight.sum.wrds.n -1.831 0.0672 .
## D.ratio.wrds.stop.n.wrds.n -0.148 0.8821
## D.terms.post.stem.n.log -0.842 0.3997
## D.terms.post.stop.n.log 0.392 0.6947
## D.weight.post.stem.sum -0.273 0.7846
## D.weight.post.stop.sum 0.378 0.7051
## D.weight.sum NA NA
## D.wrds.n.log 1.424 0.1546
## D.wrds.stop.n.log -0.225 0.8219
## D.wrds.unq.n.log NA NA
## cellular.fctr1 1.683 0.0923 .
## cellular.fctrUnknown 1.981 0.0476 *
## color.fctrGold -0.267 0.7891
## `color.fctrSpace Gray` 0.762 0.4462
## color.fctrUnknown -0.403 0.6869
## color.fctrWhite 0.286 0.7748
## `condition.fctrFor parts or not working` -2.229 0.0258 *
## `condition.fctrManufacturer refurbished` 0.168 0.8669
## condition.fctrNew 0.434 0.6640
## `condition.fctrNew other (see details)` 1.677 0.0935 .
## `condition.fctrSeller refurbished` -1.418 0.1561
## `prdl.descr.my.fctrUnknown#1` 2.271 0.0232 *
## `prdl.descr.my.fctriPad1#0` -1.956 0.0505 .
## `prdl.descr.my.fctriPad1#1` 2.210 0.0271 *
## `prdl.descr.my.fctriPad2#0` -1.005 0.3150
## `prdl.descr.my.fctriPad2#1` 2.253 0.0243 *
## `prdl.descr.my.fctriPad3#0` -0.002 0.9987
## `prdl.descr.my.fctriPad3#1` 2.292 0.0219 *
## `prdl.descr.my.fctriPad4#0` 0.208 0.8349
## `prdl.descr.my.fctriPad4#1` 2.435 0.0149 *
## `prdl.descr.my.fctriPadAir#0` 0.723 0.4699
## `prdl.descr.my.fctriPadAir#1` 0.004 0.9966
## `prdl.descr.my.fctriPadAir2#0` 1.438 0.1504
## `prdl.descr.my.fctriPadAir2#1` 0.007 0.9940
## `prdl.descr.my.fctriPadmini#0` -1.158 0.2469
## `prdl.descr.my.fctriPadmini#1` 2.115 0.0344 *
## `prdl.descr.my.fctriPadmini2#0` -0.252 0.8009
## `prdl.descr.my.fctriPadmini2#1` 0.002 0.9981
## `prdl.descr.my.fctriPadmini3#0` 0.118 0.9058
## `prdl.descr.my.fctriPadmini3#1` NA NA
## `spdiff.cut.fctr(-100,-10]` 0.015 0.9879
## `spdiff.cut.fctr(-10,-1]` 0.016 0.9872
## `spdiff.cut.fctr(-1,0]` 0.009 0.9927
## `spdiff.cut.fctr(0,1]` 0.015 0.9877
## `spdiff.cut.fctr(1,10]` 0.016 0.9873
## `spdiff.cut.fctr(10,100]` 0.015 0.9876
## `spdiff.cut.fctr(100,1e+03]` 0.013 0.9896
## sprice.d20nexp 0.732 0.4644
## sprice.log10 0.658 0.5105
## sprice.root2 -1.362 0.1733
## startprice.dcm1.is9 -1.604 0.1087
## startprice.dgt2.is9 1.560 0.1189
## storage.fctr16 -1.638 0.1015
## storage.fctr32 -1.117 0.2639
## storage.fctr64 -0.849 0.3959
## storage.fctrUnknown -1.624 0.1043
## `cellular.fctr0:carrier.fctrNone` NA NA
## `cellular.fctr1:carrier.fctrNone` NA NA
## `cellular.fctrUnknown:carrier.fctrNone` NA NA
## `cellular.fctr0:carrier.fctrSprint` NA NA
## `cellular.fctr1:carrier.fctrSprint` 0.631 0.5282
## `cellular.fctrUnknown:carrier.fctrSprint` NA NA
## `cellular.fctr0:carrier.fctrT-Mobile` NA NA
## `cellular.fctr1:carrier.fctrT-Mobile` 0.766 0.4436
## `cellular.fctrUnknown:carrier.fctrT-Mobile` NA NA
## `cellular.fctr0:carrier.fctrUnknown` NA NA
## `cellular.fctr1:carrier.fctrUnknown` -2.186 0.0288 *
## `cellular.fctrUnknown:carrier.fctrUnknown` NA NA
## `cellular.fctr0:carrier.fctrVerizon` NA NA
## `cellular.fctr1:carrier.fctrVerizon` 0.289 0.7729
## `cellular.fctrUnknown:carrier.fctrVerizon` NA NA
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr2` -0.003 0.9974
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr2` 1.202 0.2295
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr2` -2.097 0.0360 *
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr2` 0.334 0.7381
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr2` -1.221 0.2220
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr2` -0.001 0.9995
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2` 0.000 0.9997
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr2` 2.092 0.0364 *
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2` 0.002 0.9985
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr3` 0.003 0.9973
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr3` 1.535 0.1249
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr3` -0.002 0.9986
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr3` 0.889 0.3741
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr3` -0.001 0.9990
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr3` 0.001 0.9994
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr3` 0.002 0.9987
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3` 0.003 0.9979
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr4` 0.072 0.9423
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr4` 0.002 0.9983
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr4` -0.001 0.9991
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4` NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 533.64 on 530 degrees of freedom
## Residual deviance: 292.98 on 450 degrees of freedom
## AIC: 454.98
##
## Number of Fisher Scoring iterations: 20
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## sold.fctr sold.fctr.predict.RFE.X.glm.N sold.fctr.predict.RFE.X.glm.Y
## 1 N 366 58
## 2 Y 24 83
## Prediction
## Reference N Y
## N 366 58
## Y 24 83
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.8455743879 0.5710739405 0.8119850163 0.8752569045 0.7984934087
## AccuracyPValue McnemarPValue
## 0.0032234953 0.0002681827
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## sold.fctr sold.fctr.predict.RFE.X.glm.N sold.fctr.predict.RFE.X.glm.Y
## 1 N 294 80
## 2 Y 50 60
## Prediction
## Reference N Y
## N 294 80
## Y 50 60
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7314050 0.3024390 0.6895485 0.7704077 0.7727273
## AccuracyPValue McnemarPValue
## 0.9856745 0.0109758
## id
## 1 RFE.X.glm
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 3.452 0.235
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.7474431 0.9528302 0.5420561 0.9143229
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.6693548 0.740737
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.811985 0.8752569 0.2025898
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.6304813 0.8609626 0.4 0.7157997
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.48 0.731405
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6895485 0.7704077 0.302439
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.03819945 0.07904272
## label step_major step_minor label_minor bgn end
## 3 fit.models_1_RFE.X 2 1 glm 309.189 316.254
## 4 fit.models_1_RFE.X 2 2 bayesglm 316.255 NA
## elapsed
## 3 7.066
## 4 NA
## [1] "fitting model: RFE.X.bayesglm"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Loading required package: arm
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: lme4
##
## arm (Version 1.8-6, built: 2015-7-7)
##
## Working directory is /Users/bbalaji-2012/Documents/Work/Courses/MIT/Analytics_Edge_15_071x/Assignments/Kaggle_eBay_iPads
## + Fold1.Rep1: parameter=none
## - Fold1.Rep1: parameter=none
## + Fold2.Rep1: parameter=none
## - Fold2.Rep1: parameter=none
## + Fold3.Rep1: parameter=none
## - Fold3.Rep1: parameter=none
## + Fold1.Rep2: parameter=none
## - Fold1.Rep2: parameter=none
## + Fold2.Rep2: parameter=none
## - Fold2.Rep2: parameter=none
## + Fold3.Rep2: parameter=none
## - Fold3.Rep2: parameter=none
## + Fold1.Rep3: parameter=none
## - Fold1.Rep3: parameter=none
## + Fold2.Rep3: parameter=none
## - Fold2.Rep3: parameter=none
## + Fold3.Rep3: parameter=none
## - Fold3.Rep3: parameter=none
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.79624 -0.61099 -0.26459 -0.05762 2.57282
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 4.161658 4.773316
## D.chrs.n.log -0.020999 0.490973
## D.chrs.pnct11.n.log -1.049388 0.697071
## D.chrs.uppr.n.log -0.002830 0.508031
## D.ratio.weight.sum.wrds.n -0.705524 0.508432
## D.ratio.wrds.stop.n.wrds.n -2.537053 2.520613
## D.terms.post.stem.n.log -0.716722 1.004077
## D.terms.post.stop.n.log -0.763906 1.012245
## D.weight.post.stem.sum 0.186075 0.413529
## D.weight.post.stop.sum 0.130165 0.381054
## D.weight.sum 0.186075 0.413529
## D.wrds.n.log -0.083127 0.807033
## D.wrds.stop.n.log 0.011089 0.632732
## D.wrds.unq.n.log -0.716722 1.004077
## cellular.fctr1 0.194257 1.307662
## cellular.fctrUnknown 0.109155 1.653238
## color.fctrGold 0.117867 0.795133
## `color.fctrSpace Gray` 0.532468 0.521476
## color.fctrUnknown 0.024489 0.367013
## color.fctrWhite 0.076241 0.434456
## `condition.fctrFor parts or not working` -0.964225 0.602419
## `condition.fctrManufacturer refurbished` -0.138378 0.832204
## condition.fctrNew -0.162977 0.476296
## `condition.fctrNew other (see details)` 0.625466 0.658574
## `condition.fctrSeller refurbished` -1.271586 0.734735
## `prdl.descr.my.fctrUnknown#1` 0.826684 1.027997
## `prdl.descr.my.fctriPad1#0` -1.486802 0.748194
## `prdl.descr.my.fctriPad1#1` -0.339704 1.015111
## `prdl.descr.my.fctriPad2#0` -0.496076 0.752311
## `prdl.descr.my.fctriPad2#1` 0.108471 0.937693
## `prdl.descr.my.fctriPad3#0` -2.449969 1.545301
## `prdl.descr.my.fctriPad3#1` 0.628577 0.909206
## `prdl.descr.my.fctriPad4#0` 0.514127 0.779097
## `prdl.descr.my.fctriPad4#1` 0.767513 1.139790
## `prdl.descr.my.fctriPadAir#0` 0.550525 0.689132
## `prdl.descr.my.fctriPadAir#1` -1.488862 1.653360
## `prdl.descr.my.fctriPadAir2#0` 0.760994 0.729925
## `prdl.descr.my.fctriPadAir2#1` 0.990980 1.613174
## `prdl.descr.my.fctriPadmini#0` -0.665513 0.589644
## `prdl.descr.my.fctriPadmini#1` 0.313618 1.061740
## `prdl.descr.my.fctriPadmini2#0` 0.088801 0.675218
## `prdl.descr.my.fctriPadmini2#1` 0.251234 1.233713
## `prdl.descr.my.fctriPadmini3#0` -0.181686 0.833633
## `prdl.descr.my.fctriPadmini3#1` 0.000000 2.500000
## `spdiff.cut.fctr(-100,-10]` 2.344428 0.694619
## `spdiff.cut.fctr(-10,-1]` 3.965569 0.781355
## `spdiff.cut.fctr(-1,0]` 6.895235 2.108826
## `spdiff.cut.fctr(0,1]` 1.927849 1.437901
## `spdiff.cut.fctr(1,10]` 3.501817 0.792832
## `spdiff.cut.fctr(10,100]` 2.939499 0.744921
## `spdiff.cut.fctr(100,1e+03]` -1.235791 1.398909
## sprice.d20nexp 1.339686 3.599631
## sprice.log10 -0.495147 0.976624
## sprice.root2 -0.121507 0.134513
## startprice.dcm1.is9 -0.497451 0.299764
## startprice.dgt2.is9 0.756634 0.380920
## storage.fctr16 -0.501548 0.732344
## storage.fctr32 -0.063701 0.750758
## storage.fctr64 0.126034 0.743673
## storage.fctrUnknown -0.411496 0.910285
## `cellular.fctr0:carrier.fctrNone` -0.292313 1.299941
## `cellular.fctr1:carrier.fctrNone` 0.000000 2.500000
## `cellular.fctrUnknown:carrier.fctrNone` 0.000000 2.500000
## `cellular.fctr0:carrier.fctrSprint` 0.000000 2.500000
## `cellular.fctr1:carrier.fctrSprint` 0.846026 0.901519
## `cellular.fctrUnknown:carrier.fctrSprint` 0.000000 2.500000
## `cellular.fctr0:carrier.fctrT-Mobile` 0.000000 2.500000
## `cellular.fctr1:carrier.fctrT-Mobile` 0.430520 0.992720
## `cellular.fctrUnknown:carrier.fctrT-Mobile` 0.000000 2.500000
## `cellular.fctr0:carrier.fctrUnknown` 0.000000 2.500000
## `cellular.fctr1:carrier.fctrUnknown` -1.571699 0.836486
## `cellular.fctrUnknown:carrier.fctrUnknown` 0.109155 1.653238
## `cellular.fctr0:carrier.fctrVerizon` 0.000000 2.500000
## `cellular.fctr1:carrier.fctrVerizon` 0.164859 0.583239
## `cellular.fctrUnknown:carrier.fctrVerizon` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr2` -1.584802 1.624609
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr2` 0.327728 1.125101
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr2` -0.825100 1.092035
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr2` 0.665989 0.996902
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr2` -0.051346 1.293655
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr2` -0.561127 1.950159
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2` 2.565911 1.734273
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr2` 1.567335 1.251377
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2` 0.871142 1.443044
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr3` 7.597544 2.400553
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr3` 1.121343 1.007794
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr3` -1.391065 1.636943
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr3` 0.988289 1.091466
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr3` -0.231702 2.099996
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr3` -0.042963 2.397776
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr3` 3.826774 1.925944
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3` 2.967019 1.712896
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr4` 0.540734 1.544932
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr4` -0.009749 2.474298
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr4` -0.900190 1.752769
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4` 0.000000 2.500000
## z value Pr(>|z|)
## (Intercept) 0.872 0.383285
## D.chrs.n.log -0.043 0.965884
## D.chrs.pnct11.n.log -1.505 0.132215
## D.chrs.uppr.n.log -0.006 0.995556
## D.ratio.weight.sum.wrds.n -1.388 0.165244
## D.ratio.wrds.stop.n.wrds.n -1.007 0.314164
## D.terms.post.stem.n.log -0.714 0.475344
## D.terms.post.stop.n.log -0.755 0.450450
## D.weight.post.stem.sum 0.450 0.652733
## D.weight.post.stop.sum 0.342 0.732659
## D.weight.sum 0.450 0.652733
## D.wrds.n.log -0.103 0.917961
## D.wrds.stop.n.log 0.018 0.986017
## D.wrds.unq.n.log -0.714 0.475344
## cellular.fctr1 0.149 0.881907
## cellular.fctrUnknown 0.066 0.947358
## color.fctrGold 0.148 0.882157
## `color.fctrSpace Gray` 1.021 0.307217
## color.fctrUnknown 0.067 0.946800
## color.fctrWhite 0.175 0.860697
## `condition.fctrFor parts or not working` -1.601 0.109468
## `condition.fctrManufacturer refurbished` -0.166 0.867938
## condition.fctrNew -0.342 0.732219
## `condition.fctrNew other (see details)` 0.950 0.342251
## `condition.fctrSeller refurbished` -1.731 0.083510 .
## `prdl.descr.my.fctrUnknown#1` 0.804 0.421299
## `prdl.descr.my.fctriPad1#0` -1.987 0.046901 *
## `prdl.descr.my.fctriPad1#1` -0.335 0.737891
## `prdl.descr.my.fctriPad2#0` -0.659 0.509638
## `prdl.descr.my.fctriPad2#1` 0.116 0.907907
## `prdl.descr.my.fctriPad3#0` -1.585 0.112868
## `prdl.descr.my.fctriPad3#1` 0.691 0.489347
## `prdl.descr.my.fctriPad4#0` 0.660 0.509318
## `prdl.descr.my.fctriPad4#1` 0.673 0.500705
## `prdl.descr.my.fctriPadAir#0` 0.799 0.424367
## `prdl.descr.my.fctriPadAir#1` -0.901 0.367850
## `prdl.descr.my.fctriPadAir2#0` 1.043 0.297150
## `prdl.descr.my.fctriPadAir2#1` 0.614 0.539014
## `prdl.descr.my.fctriPadmini#0` -1.129 0.259037
## `prdl.descr.my.fctriPadmini#1` 0.295 0.767703
## `prdl.descr.my.fctriPadmini2#0` 0.132 0.895368
## `prdl.descr.my.fctriPadmini2#1` 0.204 0.838634
## `prdl.descr.my.fctriPadmini3#0` -0.218 0.827472
## `prdl.descr.my.fctriPadmini3#1` 0.000 1.000000
## `spdiff.cut.fctr(-100,-10]` 3.375 0.000738 ***
## `spdiff.cut.fctr(-10,-1]` 5.075 3.87e-07 ***
## `spdiff.cut.fctr(-1,0]` 3.270 0.001077 **
## `spdiff.cut.fctr(0,1]` 1.341 0.180006
## `spdiff.cut.fctr(1,10]` 4.417 1.00e-05 ***
## `spdiff.cut.fctr(10,100]` 3.946 7.94e-05 ***
## `spdiff.cut.fctr(100,1e+03]` -0.883 0.377022
## sprice.d20nexp 0.372 0.709764
## sprice.log10 -0.507 0.612155
## sprice.root2 -0.903 0.366362
## startprice.dcm1.is9 -1.659 0.097020 .
## startprice.dgt2.is9 1.986 0.046997 *
## storage.fctr16 -0.685 0.493436
## storage.fctr32 -0.085 0.932382
## storage.fctr64 0.169 0.865424
## storage.fctrUnknown -0.452 0.651232
## `cellular.fctr0:carrier.fctrNone` -0.225 0.822083
## `cellular.fctr1:carrier.fctrNone` 0.000 1.000000
## `cellular.fctrUnknown:carrier.fctrNone` 0.000 1.000000
## `cellular.fctr0:carrier.fctrSprint` 0.000 1.000000
## `cellular.fctr1:carrier.fctrSprint` 0.938 0.348016
## `cellular.fctrUnknown:carrier.fctrSprint` 0.000 1.000000
## `cellular.fctr0:carrier.fctrT-Mobile` 0.000 1.000000
## `cellular.fctr1:carrier.fctrT-Mobile` 0.434 0.664523
## `cellular.fctrUnknown:carrier.fctrT-Mobile` 0.000 1.000000
## `cellular.fctr0:carrier.fctrUnknown` 0.000 1.000000
## `cellular.fctr1:carrier.fctrUnknown` -1.879 0.060254 .
## `cellular.fctrUnknown:carrier.fctrUnknown` 0.066 0.947358
## `cellular.fctr0:carrier.fctrVerizon` 0.000 1.000000
## `cellular.fctr1:carrier.fctrVerizon` 0.283 0.777436
## `cellular.fctrUnknown:carrier.fctrVerizon` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr2` -0.975 0.329313
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr2` 0.291 0.770832
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr2` -0.756 0.449912
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr2` 0.668 0.504096
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr2` -0.040 0.968340
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr2` -0.288 0.773550
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2` 1.480 0.138998
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr2` 1.252 0.210392
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2` 0.604 0.546054
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr3` 3.165 0.001551 **
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr3` 1.113 0.265850
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr3` -0.850 0.395440
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr3` 0.905 0.365217
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr3` -0.110 0.912144
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr3` -0.018 0.985704
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr3` 1.987 0.046927 *
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3` 1.732 0.083244 .
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr4` 0.350 0.726335
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr4` -0.004 0.996856
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr4` -0.514 0.607545
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4` 0.000 1.000000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 533.64 on 530 degrees of freedom
## Residual deviance: 328.98 on 396 degrees of freedom
## AIC: 598.98
##
## Number of Fisher Scoring iterations: 28
## sold.fctr sold.fctr.predict.RFE.X.bayesglm.N
## 1 N 373
## 2 Y 26
## sold.fctr.predict.RFE.X.bayesglm.Y
## 1 51
## 2 81
## Prediction
## Reference N Y
## N 373 51
## Y 26 81
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.8549905838 0.5855809286 0.8221278036 0.8838361063 0.7984934087
## AccuracyPValue McnemarPValue
## 0.0004722958 0.0062370075
## sold.fctr sold.fctr.predict.RFE.X.bayesglm.N
## 1 N 291
## 2 Y 51
## sold.fctr.predict.RFE.X.bayesglm.Y
## 1 83
## 2 59
## Prediction
## Reference N Y
## N 291 83
## Y 51 59
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.723140496 0.285160039 0.680961776 0.762568328 0.772727273
## AccuracyPValue McnemarPValue
## 0.995392266 0.007406496
## id
## 1 RFE.X.bayesglm
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 6.118 0.468
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.7252579 0.9551887 0.4953271 0.8990698
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.6778243 0.7815306
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.8221278 0.8838361 0.2109032
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.6 0.9090909 0.2909091 0.7037555
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.468254 0.7231405
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6809618 0.7625683 0.28516
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.02615618 0.0876317
## label step_major step_minor label_minor bgn end
## 4 fit.models_1_RFE.X 2 2 bayesglm 316.255 325.557
## 5 fit.models_1_RFE.X 2 3 glmnet 325.557 NA
## elapsed
## 4 9.302
## 5 NA
## [1] "fitting model: RFE.X.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 82 -none- numeric
## beta 10988 dgCMatrix S4
## df 82 -none- numeric
## dim 2 -none- numeric
## lambda 82 -none- numeric
## dev.ratio 82 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 134 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -0.8271081607
## D.chrs.pnct11.n.log
## -0.0103224923
## D.terms.post.stem.n.log
## -0.0069752942
## D.terms.post.stop.n.log
## -0.0168454394
## D.wrds.unq.n.log
## -0.0009227177
## prdl.descr.my.fctriPad4#0
## 0.0576454262
## prdl.descr.my.fctriPadAir#1
## -0.1352704587
## spdiff.cut.fctr(-10,-1]
## 1.0532076919
## spdiff.cut.fctr(-1,0]
## 2.0774440992
## spdiff.cut.fctr(1,10]
## 0.4321274474
## spdiff.cut.fctr(10,100]
## 0.3388087876
## sprice.log10
## -0.0549995002
## sprice.root2
## -0.0292119313
## cellular.fctr1:carrier.fctrUnknown
## -0.0844204922
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## 1.7146623304
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## 0.8074756802
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -0.729511548
## D.chrs.pnct11.n.log
## -0.046380971
## D.terms.post.stem.n.log
## -0.010376680
## D.terms.post.stop.n.log
## -0.020906742
## D.wrds.unq.n.log
## -0.005677139
## prdl.descr.my.fctriPad4#0
## 0.130953904
## prdl.descr.my.fctriPadAir#1
## -0.184144655
## spdiff.cut.fctr(-10,-1]
## 1.111012644
## spdiff.cut.fctr(-1,0]
## 2.223218543
## spdiff.cut.fctr(1,10]
## 0.493784771
## spdiff.cut.fctr(10,100]
## 0.384272149
## spdiff.cut.fctr(100,1e+03]
## -0.049109520
## sprice.log10
## -0.076060913
## sprice.root2
## -0.028474574
## cellular.fctr1:carrier.fctrUnknown
## -0.153836298
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## 1.942581599
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## 1.002825431
## sold.fctr sold.fctr.predict.RFE.X.glmnet.N
## 1 N 297
## 2 Y 36
## sold.fctr.predict.RFE.X.glmnet.Y
## 1 127
## 2 71
## Prediction
## Reference N Y
## N 297 127
## Y 36 71
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.930320e-01 2.762098e-01 6.518637e-01 7.320359e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.797632e-12
## sold.fctr sold.fctr.predict.RFE.X.glmnet.N
## 1 N 248
## 2 Y 36
## sold.fctr.predict.RFE.X.glmnet.Y
## 1 126
## 2 74
## Prediction
## Reference N Y
## N 248 126
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.652893e-01 2.605809e-01 6.213030e-01 7.072412e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 2.700298e-12
## id
## 1 RFE.X.glmnet
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 21.603 0.972
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5373832 1 0.07476636 0.7644375
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4655738 0.8054348
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6518637 0.7320359 0.08387615
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5406417 0.9812834 0.1 0.7120929
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4774194 0.6652893
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.621303 0.7072412 0.2605809
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.0129181 0.05410138
## label step_major step_minor label_minor bgn end
## 5 fit.models_1_RFE.X 2 3 glmnet 325.557 351.063
## 6 fit.models_1_RFE.X 2 4 rpart 351.064 NA
## elapsed
## 5 25.506
## 6 NA
## [1] "fitting model: RFE.X.rpart"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## + Fold1.Rep1: cp=0.003115
## - Fold1.Rep1: cp=0.003115
## + Fold2.Rep1: cp=0.003115
## - Fold2.Rep1: cp=0.003115
## + Fold3.Rep1: cp=0.003115
## - Fold3.Rep1: cp=0.003115
## + Fold1.Rep2: cp=0.003115
## - Fold1.Rep2: cp=0.003115
## + Fold2.Rep2: cp=0.003115
## - Fold2.Rep2: cp=0.003115
## + Fold3.Rep2: cp=0.003115
## - Fold3.Rep2: cp=0.003115
## + Fold1.Rep3: cp=0.003115
## - Fold1.Rep3: cp=0.003115
## + Fold2.Rep3: cp=0.003115
## - Fold2.Rep3: cp=0.003115
## + Fold3.Rep3: cp=0.003115
## - Fold3.Rep3: cp=0.003115
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0374 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 531
##
## CP nsplit rel error
## 1 0.03738318 0 1
##
## Node number 1: 531 observations
## predicted class=N expected loss=0.2015066 P(node) =1
## class counts: 424 107
## probabilities: 0.798 0.202
##
## n= 531
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 531 107 N (0.7984934 0.2015066) *
## sold.fctr sold.fctr.predict.RFE.X.rpart.Y
## 1 N 424
## 2 Y 107
## Prediction
## Reference N Y
## N 0 424
## Y 0 107
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.015066e-01 0.000000e+00 1.681880e-01 2.381872e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 8.945646e-94
## sold.fctr sold.fctr.predict.RFE.X.rpart.Y
## 1 N 374
## 2 Y 110
## Prediction
## Reference N Y
## N 0 374
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.272727e-01 0.000000e+00 1.906680e-01 2.672469e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 6.857299e-83
## id
## 1 RFE.X.rpart
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 3.268 0.067
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.3354232 0.7941175
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.168188 0.2381872 0.07284994
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.5
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.3703704 0.2272727
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.190668 0.2672469 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01260342 0.08825496
## label step_major step_minor label_minor bgn end
## 6 fit.models_1_RFE.X 2 4 rpart 351.064 357.167
## 7 fit.models_1_RFE.X 2 5 gbm 357.168 NA
## elapsed
## 6 6.103
## 7 NA
## [1] "fitting model: RFE.X.gbm"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Loading required package: gbm
## Loading required package: splines
## Loaded gbm 2.1.1
## Aggregating results
## Selecting tuning parameters
## Fitting n.trees = 50, interaction.depth = 1, shrinkage = 0.1, n.minobsinnode = 10 on full training set
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 43: prdl.descr.my.fctriPadmini3#1 has no
## variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 61: cellular.fctr1:carrier.fctrNone has no
## variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 62: cellular.fctrUnknown:carrier.fctrNone has
## no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 63: cellular.fctr0:carrier.fctrSprint has no
## variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 65: cellular.fctrUnknown:carrier.fctrSprint
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 66: cellular.fctr0:carrier.fctrT-Mobile has no
## variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 68: cellular.fctrUnknown:carrier.fctrT-Mobile
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 69: cellular.fctr0:carrier.fctrUnknown has no
## variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 72: cellular.fctr0:carrier.fctrVerizon has no
## variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 74: cellular.fctrUnknown:carrier.fctrVerizon
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 75:
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr2 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 77: prdl.descr.my.fctriPad1#0:.clusterid.fctr2
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 79: prdl.descr.my.fctriPad2#0:.clusterid.fctr2
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 81: prdl.descr.my.fctriPad3#0:.clusterid.fctr2
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 83: prdl.descr.my.fctriPad4#0:.clusterid.fctr2
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 85:
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr2 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 87:
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 89:
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr2 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 91:
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 93:
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 94:
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 95:
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 97: prdl.descr.my.fctriPad1#0:.clusterid.fctr3
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0, 4.61512051684126,
## 4.61512051684126, : variable 99: prdl.descr.my.fctriPad2#0:.clusterid.fctr3
## has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 101:
## prdl.descr.my.fctriPad3#0:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 103:
## prdl.descr.my.fctriPad4#0:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 105:
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 107:
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 108:
## prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 109:
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 111:
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 113:
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 114:
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 115:
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 117:
## prdl.descr.my.fctriPad1#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 118:
## prdl.descr.my.fctriPad1#1:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 119:
## prdl.descr.my.fctriPad2#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 120:
## prdl.descr.my.fctriPad2#1:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 121:
## prdl.descr.my.fctriPad3#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 122:
## prdl.descr.my.fctriPad3#1:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 123:
## prdl.descr.my.fctriPad4#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 124:
## prdl.descr.my.fctriPad4#1:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 125:
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 127:
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 128:
## prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 129:
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 131:
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 132:
## prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 133:
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4 has no variation.
## Warning in gbm.fit(x = structure(c(0, 0, 0, 0, 0,
## 4.61512051684126, 4.61512051684126, : variable 134:
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4 has no variation.
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 0.9988 nan 0.1000 0.0034
## 2 0.9906 nan 0.1000 0.0021
## 3 0.9867 nan 0.1000 0.0001
## 4 0.9810 nan 0.1000 0.0012
## 5 0.9767 nan 0.1000 0.0003
## 6 0.9652 nan 0.1000 0.0034
## 7 0.9615 nan 0.1000 0.0014
## 8 0.9588 nan 0.1000 0.0006
## 9 0.9531 nan 0.1000 0.0026
## 10 0.9484 nan 0.1000 0.0014
## 20 0.9190 nan 0.1000 -0.0016
## 40 0.8826 nan 0.1000 -0.0014
## 50 0.8686 nan 0.1000 -0.0008
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: n.trees
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: interaction.depth
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: shrinkage
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: n.minobsinnode
## var
## spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-10,-1]
## sprice.d20nexp sprice.d20nexp
## sprice.log10 sprice.log10
## spdiff.cut.fctr(10,100] spdiff.cut.fctr(10,100]
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(1,10]
## D.chrs.uppr.n.log D.chrs.uppr.n.log
## D.ratio.weight.sum.wrds.n D.ratio.weight.sum.wrds.n
## D.ratio.wrds.stop.n.wrds.n D.ratio.wrds.stop.n.wrds.n
## storage.fctr64 storage.fctr64
## prdl.descr.my.fctriPadmini2#0 prdl.descr.my.fctriPadmini2#0
## cellular.fctr1:carrier.fctrVerizon cellular.fctr1:carrier.fctrVerizon
## prdl.descr.my.fctriPadAir#0 prdl.descr.my.fctriPadAir#0
## condition.fctrFor parts or not working condition.fctrFor parts or not working
## D.chrs.pnct11.n.log D.chrs.pnct11.n.log
## D.chrs.n.log D.chrs.n.log
## D.terms.post.stem.n.log D.terms.post.stem.n.log
## D.terms.post.stop.n.log D.terms.post.stop.n.log
## D.weight.post.stem.sum D.weight.post.stem.sum
## D.weight.post.stop.sum D.weight.post.stop.sum
## D.weight.sum D.weight.sum
## D.wrds.n.log D.wrds.n.log
## D.wrds.stop.n.log D.wrds.stop.n.log
## D.wrds.unq.n.log D.wrds.unq.n.log
## cellular.fctr1 cellular.fctr1
## cellular.fctrUnknown cellular.fctrUnknown
## color.fctrGold color.fctrGold
## color.fctrSpace Gray color.fctrSpace Gray
## color.fctrUnknown color.fctrUnknown
## color.fctrWhite color.fctrWhite
## condition.fctrManufacturer refurbished condition.fctrManufacturer refurbished
## condition.fctrNew condition.fctrNew
## condition.fctrNew other (see details) condition.fctrNew other (see details)
## condition.fctrSeller refurbished condition.fctrSeller refurbished
## prdl.descr.my.fctrUnknown#1 prdl.descr.my.fctrUnknown#1
## prdl.descr.my.fctriPad1#0 prdl.descr.my.fctriPad1#0
## prdl.descr.my.fctriPad1#1 prdl.descr.my.fctriPad1#1
## prdl.descr.my.fctriPad2#0 prdl.descr.my.fctriPad2#0
## prdl.descr.my.fctriPad2#1 prdl.descr.my.fctriPad2#1
## prdl.descr.my.fctriPad3#0 prdl.descr.my.fctriPad3#0
## prdl.descr.my.fctriPad3#1 prdl.descr.my.fctriPad3#1
## prdl.descr.my.fctriPad4#0 prdl.descr.my.fctriPad4#0
## prdl.descr.my.fctriPad4#1 prdl.descr.my.fctriPad4#1
## prdl.descr.my.fctriPadAir#1 prdl.descr.my.fctriPadAir#1
## prdl.descr.my.fctriPadAir2#0 prdl.descr.my.fctriPadAir2#0
## prdl.descr.my.fctriPadAir2#1 prdl.descr.my.fctriPadAir2#1
## prdl.descr.my.fctriPadmini#0 prdl.descr.my.fctriPadmini#0
## prdl.descr.my.fctriPadmini#1 prdl.descr.my.fctriPadmini#1
## prdl.descr.my.fctriPadmini2#1 prdl.descr.my.fctriPadmini2#1
## prdl.descr.my.fctriPadmini3#0 prdl.descr.my.fctriPadmini3#0
## prdl.descr.my.fctriPadmini3#1 prdl.descr.my.fctriPadmini3#1
## spdiff.cut.fctr(-100,-10] spdiff.cut.fctr(-100,-10]
## spdiff.cut.fctr(-1,0] spdiff.cut.fctr(-1,0]
## spdiff.cut.fctr(0,1] spdiff.cut.fctr(0,1]
## spdiff.cut.fctr(100,1e+03] spdiff.cut.fctr(100,1e+03]
## sprice.root2 sprice.root2
## startprice.dcm1.is9 startprice.dcm1.is9
## startprice.dgt2.is9 startprice.dgt2.is9
## storage.fctr16 storage.fctr16
## storage.fctr32 storage.fctr32
## storage.fctrUnknown storage.fctrUnknown
## cellular.fctr0:carrier.fctrNone cellular.fctr0:carrier.fctrNone
## cellular.fctr1:carrier.fctrNone cellular.fctr1:carrier.fctrNone
## cellular.fctrUnknown:carrier.fctrNone cellular.fctrUnknown:carrier.fctrNone
## cellular.fctr0:carrier.fctrSprint cellular.fctr0:carrier.fctrSprint
## cellular.fctr1:carrier.fctrSprint cellular.fctr1:carrier.fctrSprint
## cellular.fctrUnknown:carrier.fctrSprint cellular.fctrUnknown:carrier.fctrSprint
## cellular.fctr0:carrier.fctrT-Mobile cellular.fctr0:carrier.fctrT-Mobile
## cellular.fctr1:carrier.fctrT-Mobile cellular.fctr1:carrier.fctrT-Mobile
## cellular.fctrUnknown:carrier.fctrT-Mobile cellular.fctrUnknown:carrier.fctrT-Mobile
## cellular.fctr0:carrier.fctrUnknown cellular.fctr0:carrier.fctrUnknown
## cellular.fctr1:carrier.fctrUnknown cellular.fctr1:carrier.fctrUnknown
## cellular.fctrUnknown:carrier.fctrUnknown cellular.fctrUnknown:carrier.fctrUnknown
## cellular.fctr0:carrier.fctrVerizon cellular.fctr0:carrier.fctrVerizon
## cellular.fctrUnknown:carrier.fctrVerizon cellular.fctrUnknown:carrier.fctrVerizon
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr2 prdl.descr.my.fctrUnknown#0:.clusterid.fctr2
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr2 prdl.descr.my.fctrUnknown#1:.clusterid.fctr2
## prdl.descr.my.fctriPad1#0:.clusterid.fctr2 prdl.descr.my.fctriPad1#0:.clusterid.fctr2
## prdl.descr.my.fctriPad1#1:.clusterid.fctr2 prdl.descr.my.fctriPad1#1:.clusterid.fctr2
## prdl.descr.my.fctriPad2#0:.clusterid.fctr2 prdl.descr.my.fctriPad2#0:.clusterid.fctr2
## prdl.descr.my.fctriPad2#1:.clusterid.fctr2 prdl.descr.my.fctriPad2#1:.clusterid.fctr2
## prdl.descr.my.fctriPad3#0:.clusterid.fctr2 prdl.descr.my.fctriPad3#0:.clusterid.fctr2
## prdl.descr.my.fctriPad3#1:.clusterid.fctr2 prdl.descr.my.fctriPad3#1:.clusterid.fctr2
## prdl.descr.my.fctriPad4#0:.clusterid.fctr2 prdl.descr.my.fctriPad4#0:.clusterid.fctr2
## prdl.descr.my.fctriPad4#1:.clusterid.fctr2 prdl.descr.my.fctriPad4#1:.clusterid.fctr2
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr2 prdl.descr.my.fctriPadAir#0:.clusterid.fctr2
## prdl.descr.my.fctriPadAir#1:.clusterid.fctr2 prdl.descr.my.fctriPadAir#1:.clusterid.fctr2
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2 prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2
## prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2 prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr2 prdl.descr.my.fctriPadmini#0:.clusterid.fctr2
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr2 prdl.descr.my.fctriPadmini#1:.clusterid.fctr2
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2 prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2
## prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2 prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2 prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2 prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr3 prdl.descr.my.fctrUnknown#0:.clusterid.fctr3
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3 prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## prdl.descr.my.fctriPad1#0:.clusterid.fctr3 prdl.descr.my.fctriPad1#0:.clusterid.fctr3
## prdl.descr.my.fctriPad1#1:.clusterid.fctr3 prdl.descr.my.fctriPad1#1:.clusterid.fctr3
## prdl.descr.my.fctriPad2#0:.clusterid.fctr3 prdl.descr.my.fctriPad2#0:.clusterid.fctr3
## prdl.descr.my.fctriPad2#1:.clusterid.fctr3 prdl.descr.my.fctriPad2#1:.clusterid.fctr3
## prdl.descr.my.fctriPad3#0:.clusterid.fctr3 prdl.descr.my.fctriPad3#0:.clusterid.fctr3
## prdl.descr.my.fctriPad3#1:.clusterid.fctr3 prdl.descr.my.fctriPad3#1:.clusterid.fctr3
## prdl.descr.my.fctriPad4#0:.clusterid.fctr3 prdl.descr.my.fctriPad4#0:.clusterid.fctr3
## prdl.descr.my.fctriPad4#1:.clusterid.fctr3 prdl.descr.my.fctriPad4#1:.clusterid.fctr3
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr3 prdl.descr.my.fctriPadAir#0:.clusterid.fctr3
## prdl.descr.my.fctriPadAir#1:.clusterid.fctr3 prdl.descr.my.fctriPadAir#1:.clusterid.fctr3
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3 prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3
## prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3 prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr3 prdl.descr.my.fctriPadmini#0:.clusterid.fctr3
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3 prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3 prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3
## prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3 prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3 prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3 prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr4 prdl.descr.my.fctrUnknown#0:.clusterid.fctr4
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr4 prdl.descr.my.fctrUnknown#1:.clusterid.fctr4
## prdl.descr.my.fctriPad1#0:.clusterid.fctr4 prdl.descr.my.fctriPad1#0:.clusterid.fctr4
## prdl.descr.my.fctriPad1#1:.clusterid.fctr4 prdl.descr.my.fctriPad1#1:.clusterid.fctr4
## prdl.descr.my.fctriPad2#0:.clusterid.fctr4 prdl.descr.my.fctriPad2#0:.clusterid.fctr4
## prdl.descr.my.fctriPad2#1:.clusterid.fctr4 prdl.descr.my.fctriPad2#1:.clusterid.fctr4
## prdl.descr.my.fctriPad3#0:.clusterid.fctr4 prdl.descr.my.fctriPad3#0:.clusterid.fctr4
## prdl.descr.my.fctriPad3#1:.clusterid.fctr4 prdl.descr.my.fctriPad3#1:.clusterid.fctr4
## prdl.descr.my.fctriPad4#0:.clusterid.fctr4 prdl.descr.my.fctriPad4#0:.clusterid.fctr4
## prdl.descr.my.fctriPad4#1:.clusterid.fctr4 prdl.descr.my.fctriPad4#1:.clusterid.fctr4
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr4 prdl.descr.my.fctriPadAir#0:.clusterid.fctr4
## prdl.descr.my.fctriPadAir#1:.clusterid.fctr4 prdl.descr.my.fctriPadAir#1:.clusterid.fctr4
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4 prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4
## prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4 prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr4 prdl.descr.my.fctriPadmini#0:.clusterid.fctr4
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr4 prdl.descr.my.fctriPadmini#1:.clusterid.fctr4
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4 prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4
## prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4 prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4 prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4 prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4
## rel.inf
## spdiff.cut.fctr(-10,-1] 23.641364
## sprice.d20nexp 17.331504
## sprice.log10 13.050485
## spdiff.cut.fctr(10,100] 11.654433
## spdiff.cut.fctr(1,10] 8.783040
## D.chrs.uppr.n.log 4.434168
## D.ratio.weight.sum.wrds.n 3.775051
## D.ratio.wrds.stop.n.wrds.n 2.901726
## storage.fctr64 2.892091
## prdl.descr.my.fctriPadmini2#0 2.837448
## cellular.fctr1:carrier.fctrVerizon 1.682703
## prdl.descr.my.fctriPadAir#0 1.665424
## condition.fctrFor parts or not working 1.491445
## D.chrs.pnct11.n.log 1.411318
## D.chrs.n.log 1.286750
## D.terms.post.stem.n.log 1.161048
## D.terms.post.stop.n.log 0.000000
## D.weight.post.stem.sum 0.000000
## D.weight.post.stop.sum 0.000000
## D.weight.sum 0.000000
## D.wrds.n.log 0.000000
## D.wrds.stop.n.log 0.000000
## D.wrds.unq.n.log 0.000000
## cellular.fctr1 0.000000
## cellular.fctrUnknown 0.000000
## color.fctrGold 0.000000
## color.fctrSpace Gray 0.000000
## color.fctrUnknown 0.000000
## color.fctrWhite 0.000000
## condition.fctrManufacturer refurbished 0.000000
## condition.fctrNew 0.000000
## condition.fctrNew other (see details) 0.000000
## condition.fctrSeller refurbished 0.000000
## prdl.descr.my.fctrUnknown#1 0.000000
## prdl.descr.my.fctriPad1#0 0.000000
## prdl.descr.my.fctriPad1#1 0.000000
## prdl.descr.my.fctriPad2#0 0.000000
## prdl.descr.my.fctriPad2#1 0.000000
## prdl.descr.my.fctriPad3#0 0.000000
## prdl.descr.my.fctriPad3#1 0.000000
## prdl.descr.my.fctriPad4#0 0.000000
## prdl.descr.my.fctriPad4#1 0.000000
## prdl.descr.my.fctriPadAir#1 0.000000
## prdl.descr.my.fctriPadAir2#0 0.000000
## prdl.descr.my.fctriPadAir2#1 0.000000
## prdl.descr.my.fctriPadmini#0 0.000000
## prdl.descr.my.fctriPadmini#1 0.000000
## prdl.descr.my.fctriPadmini2#1 0.000000
## prdl.descr.my.fctriPadmini3#0 0.000000
## prdl.descr.my.fctriPadmini3#1 0.000000
## spdiff.cut.fctr(-100,-10] 0.000000
## spdiff.cut.fctr(-1,0] 0.000000
## spdiff.cut.fctr(0,1] 0.000000
## spdiff.cut.fctr(100,1e+03] 0.000000
## sprice.root2 0.000000
## startprice.dcm1.is9 0.000000
## startprice.dgt2.is9 0.000000
## storage.fctr16 0.000000
## storage.fctr32 0.000000
## storage.fctrUnknown 0.000000
## cellular.fctr0:carrier.fctrNone 0.000000
## cellular.fctr1:carrier.fctrNone 0.000000
## cellular.fctrUnknown:carrier.fctrNone 0.000000
## cellular.fctr0:carrier.fctrSprint 0.000000
## cellular.fctr1:carrier.fctrSprint 0.000000
## cellular.fctrUnknown:carrier.fctrSprint 0.000000
## cellular.fctr0:carrier.fctrT-Mobile 0.000000
## cellular.fctr1:carrier.fctrT-Mobile 0.000000
## cellular.fctrUnknown:carrier.fctrT-Mobile 0.000000
## cellular.fctr0:carrier.fctrUnknown 0.000000
## cellular.fctr1:carrier.fctrUnknown 0.000000
## cellular.fctrUnknown:carrier.fctrUnknown 0.000000
## cellular.fctr0:carrier.fctrVerizon 0.000000
## cellular.fctrUnknown:carrier.fctrVerizon 0.000000
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPad1#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPad1#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPad2#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPad2#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPad3#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPad3#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPad4#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPad4#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadAir#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2 0.000000
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2 0.000000
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPad1#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPad1#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPad2#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPad2#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPad3#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPad3#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPad4#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPad4#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadAir#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3 0.000000
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3 0.000000
## prdl.descr.my.fctrUnknown#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPad1#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPad1#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPad2#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPad2#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPad3#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPad3#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPad4#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPad4#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadAir#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadAir#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadmini#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4 0.000000
## prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4 0.000000
## sold.fctr sold.fctr.predict.RFE.X.gbm.N sold.fctr.predict.RFE.X.gbm.Y
## 1 N 297 127
## 2 Y 30 77
## Prediction
## Reference N Y
## N 297 127
## Y 30 77
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.043315e-01 3.137671e-01 6.635135e-01 7.428574e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999999e-01 1.835821e-14
## sold.fctr sold.fctr.predict.RFE.X.gbm.N sold.fctr.predict.RFE.X.gbm.Y
## 1 N 233 141
## 2 Y 41 69
## Prediction
## Reference N Y
## N 233 141
## Y 41 69
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.239669e-01 1.894737e-01 5.791279e-01 6.672814e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 2.162136e-13
## id
## 1 RFE.X.gbm
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 8.257 0.26
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5548514 0.9882075 0.1214953 0.7596544
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4951768 0.7978522
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6635135 0.7428574 0.07317897
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5368984 0.9919786 0.08181818 0.6834346
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.43125 0.6239669
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.5791279 0.6672814 0.1894737
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01147228 0.08088757
## label step_major step_minor label_minor bgn end
## 7 fit.models_1_RFE.X 2 5 gbm 357.168 369.179
## 8 fit.models_1_RFE.X 2 6 rf 369.179 NA
## elapsed
## 7 12.011
## 8 NA
## [1] "fitting model: RFE.X.rf"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:gdata':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: mtry
## Length Class Mode
## call 4 -none- call
## type 1 -none- character
## predicted 531 factor numeric
## err.rate 1500 -none- numeric
## confusion 6 -none- numeric
## votes 1062 matrix numeric
## oob.times 531 -none- numeric
## classes 2 -none- character
## importance 134 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 14 -none- list
## y 531 factor numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## xNames 134 -none- character
## problemType 1 -none- character
## tuneValue 1 data.frame list
## obsLevels 2 -none- character
## sold.fctr sold.fctr.predict.RFE.X.rf.N sold.fctr.predict.RFE.X.rf.Y
## 1 N 424 NA
## 2 Y 82 25
## Prediction
## Reference N Y
## N 424 0
## Y 82 25
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.455744e-01 3.274536e-01 8.119850e-01 8.752569e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 3.223495e-03 3.721060e-19
## sold.fctr sold.fctr.predict.RFE.X.rf.Y
## 1 N 374
## 2 Y 110
## Prediction
## Reference N Y
## N 0 374
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.272727e-01 0.000000e+00 1.906680e-01 2.672469e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 6.857299e-83
## id
## 1 RFE.X.rf
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 6.895 1.225
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.84348
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.3787879 0.7984934
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.811985 0.8752569 0
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.6501823
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0 0.3703704 0.2272727
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.190668 0.2672469 0
## label step_major step_minor label_minor bgn end
## 8 fit.models_1_RFE.X 2 6 rf 369.179 379.333
## 9 fit.models_1_RFE.X 2 7 nnet 379.334 NA
## elapsed
## 8 10.155
## 9 NA
## [1] "fitting model: RFE.X.nnet"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Loading required package: nnet
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Aggregating results
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
## Selecting tuning parameters
## Fitting size = 1, decay = 0 on full training set
## # weights: 137
## initial value 361.868993
## iter 10 value 262.684091
## iter 20 value 218.491154
## iter 30 value 201.555147
## iter 40 value 198.145126
## iter 50 value 198.133628
## final value 198.133602
## converged
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: size
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: decay
## Loading required package: NeuralNetTools
## Warning: replacing previous import by 'scales::alpha' when loading
## 'NeuralNetTools'
## a 134-1-1 network with 137 weights
## options were - entropy fitting
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1
## 102.60 -357.33 103.77 -356.19 325.22 139.93 -262.08 -260.64
## i8->h1 i9->h1 i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1
## 346.25 291.38 346.13 -275.42 -198.39 -261.87 43.90 81.31
## i16->h1 i17->h1 i18->h1 i19->h1 i20->h1 i21->h1 i22->h1 i23->h1
## 57.41 144.82 45.46 74.83 -76.52 87.38 -197.00 161.85
## i24->h1 i25->h1 i26->h1 i27->h1 i28->h1 i29->h1 i30->h1 i31->h1
## -127.52 -245.66 -124.49 56.57 -195.90 194.12 -332.08 -42.09
## i32->h1 i33->h1 i34->h1 i35->h1 i36->h1 i37->h1 i38->h1 i39->h1
## 3.09 -30.64 10.99 67.47 272.90 89.99 -97.36 81.66
## i40->h1 i41->h1 i42->h1 i43->h1 i44->h1 i45->h1 i46->h1 i47->h1
## -44.17 -131.64 -128.03 -0.12 110.41 481.73 213.07 85.49
## i48->h1 i49->h1 i50->h1 i51->h1 i52->h1 i53->h1 i54->h1 i55->h1
## 366.84 284.43 -261.58 194.46 21.39 -32.87 -151.27 -10.88
## i56->h1 i57->h1 i58->h1 i59->h1 i60->h1 i61->h1 i62->h1 i63->h1
## -67.57 -8.12 -35.91 -22.95 -21.70 -0.07 0.30 0.20
## i64->h1 i65->h1 i66->h1 i67->h1 i68->h1 i69->h1 i70->h1 i71->h1
## 295.78 -0.03 0.15 27.11 -0.27 -0.29 -215.05 81.62
## i72->h1 i73->h1 i74->h1 i75->h1 i76->h1 i77->h1 i78->h1 i79->h1
## 0.61 174.05 -0.64 0.37 -1.64 -0.63 96.59 -0.13
## i80->h1 i81->h1 i82->h1 i83->h1 i84->h1 i85->h1 i86->h1 i87->h1
## 36.96 0.27 -224.87 0.26 55.79 -0.67 -55.08 0.24
## i88->h1 i89->h1 i90->h1 i91->h1 i92->h1 i93->h1 i94->h1 i95->h1
## 52.98 0.38 29.41 -0.30 -93.61 -0.29 -0.55 0.58
## i96->h1 i97->h1 i98->h1 i99->h1 i100->h1 i101->h1 i102->h1 i103->h1
## -81.21 0.27 276.27 -0.61 11.91 -0.59 63.32 -0.69
## i104->h1 i105->h1 i106->h1 i107->h1 i108->h1 i109->h1 i110->h1 i111->h1
## 5.47 -0.10 13.76 0.35 0.08 0.46 -1.21 -0.60
## i112->h1 i113->h1 i114->h1 i115->h1 i116->h1 i117->h1 i118->h1 i119->h1
## -32.84 0.16 0.50 -0.48 -81.20 -0.52 0.30 -0.18
## i120->h1 i121->h1 i122->h1 i123->h1 i124->h1 i125->h1 i126->h1 i127->h1
## 0.27 0.61 0.12 -0.05 -0.61 -0.07 18.72 0.02
## i128->h1 i129->h1 i130->h1 i131->h1 i132->h1 i133->h1 i134->h1
## 0.67 -0.10 4.63 0.32 -0.32 0.44 -0.31
## b->o h1->o
## -2.48 2.81
## sold.fctr sold.fctr.predict.RFE.X.nnet.N sold.fctr.predict.RFE.X.nnet.Y
## 1 N 369 55
## 2 Y 31 76
## Prediction
## Reference N Y
## N 369 55
## Y 31 76
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.83804143 0.53565037 0.80389928 0.86836463 0.79849341
## AccuracyPValue McnemarPValue
## 0.01176318 0.01313257
## sold.fctr sold.fctr.predict.RFE.X.nnet.Y
## 1 N 374
## 2 Y 110
## Prediction
## Reference N Y
## N 0 374
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.272727e-01 0.000000e+00 1.906680e-01 2.672469e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 6.857299e-83
## id
## 1 RFE.X.nnet
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 26.173 0.115
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.786788 0.8726415 0.7009346 0.7898298
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.6386555 0.8022536
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.8038993 0.8683646 0.04244521
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.581016 0.7620321 0.4 0.581806
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0 0.3703704 0.2272727
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.190668 0.2672469 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008122698 0.05474545
## label step_major step_minor label_minor bgn end
## 9 fit.models_1_RFE.X 2 7 nnet 379.334 412.125
## 10 fit.models_1_RFE.X 2 8 avNNet 412.126 NA
## elapsed
## 9 32.791
## 10 NA
## [1] "fitting model: RFE.X.avNNet"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Aggregating results
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
## Selecting tuning parameters
## Fitting size = 5, decay = 1e-04, bag = FALSE on full training set
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: bag
## Length Class Mode
## model 5 -none- list
## repeats 1 -none- numeric
## bag 1 -none- logical
## names 134 -none- character
## terms 3 terms call
## coefnames 134 -none- character
## xlevels 0 -none- list
## xNames 134 -none- character
## problemType 1 -none- character
## tuneValue 3 data.frame list
## obsLevels 2 -none- character
## sold.fctr sold.fctr.predict.RFE.X.avNNet.N
## 1 N 407
## 2 Y 25
## sold.fctr.predict.RFE.X.avNNet.Y
## 1 17
## 2 82
## Prediction
## Reference N Y
## N 407 17
## Y 25 82
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.209040e-01 7.471429e-01 8.945873e-01 9.424022e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 5.639234e-15 2.800872e-01
## sold.fctr sold.fctr.predict.RFE.X.avNNet.N
## 1 N 315
## 2 Y 66
## sold.fctr.predict.RFE.X.avNNet.Y
## 1 59
## 2 44
## Prediction
## Reference N Y
## N 315 59
## Y 66 44
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7417355 0.2478118 0.7003064 0.7801824 0.7727273
## AccuracyPValue McnemarPValue
## 0.9520055 0.5915050
## id
## 1 RFE.X.avNNet
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 72.276 1.044
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.7522924 0.990566 0.5140187 0.9486863
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.7961165 0.8003774
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.8945873 0.9424022 0.04399954
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.581016 0.9438503 0.2181818 0.6544361
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.3 0.4131455 0.7417355
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.7003064 0.7801824 0.2478118
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005559387 0.05057841
## label step_major step_minor label_minor bgn end
## 10 fit.models_1_RFE.X 2 8 avNNet 412.126 488.419
## 11 fit.models_1_RFE.X 2 9 earth 488.419 NA
## elapsed
## 10 76.293
## 11 NA
## [1] "fitting model: RFE.X.earth"
## [1] " indep_vars: spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
##
## Attaching package: 'plotrix'
##
## The following object is masked from 'package:arm':
##
## rescale
##
## The following object is masked from 'package:gplots':
##
## plotCI
##
## Loading required package: TeachingDemos
## Aggregating results
## Selecting tuning parameters
## Fitting nprune = 2, degree = 1 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: nprune
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: degree
## Call: earth(x=matrix[531,134], y=factor.object, keepxy=TRUE,
## glm=list(family=function.object), degree=1, nprune=2)
##
## GLM coefficients
## Y
## (Intercept) -1.544589
## spdiff.cut.fctr(-10,-1] 1.544589
##
## Earth selected 2 of 110 terms, and 1 of 134 predictors
## Termination condition: RSq changed by less than 0.001 at 110 terms
## Importance: spdiff.cut.fctr(-10,-1], D.chrs.n.log-unused, ...
## Number of terms at each degree of interaction: 1 1 (additive model)
## Earth GCV 0.1549955 RSS 81.37526 GRSq 0.04033171 RSq 0.04756082
##
## GLM null.deviance 533.6379 (530 dof) deviance 513.0643 (529 dof) iters 4
## sold.fctr sold.fctr.predict.RFE.X.earth.Y
## 1 N 424
## 2 Y 107
## Prediction
## Reference N Y
## N 0 424
## Y 0 107
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.015066e-01 0.000000e+00 1.681880e-01 2.381872e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 8.945646e-94
## sold.fctr sold.fctr.predict.RFE.X.earth.Y
## 1 N 374
## 2 Y 110
## Prediction
## Reference N Y
## N 0 374
## Y 0 110
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.272727e-01 0.000000e+00 1.906680e-01 2.672469e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 6.857299e-83
## id
## 1 RFE.X.earth
## feats
## 1 spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 9.912 1.499
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5733667 0.9504717 0.1962617 0.5733667
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.1 0.3354232 0.7941069
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.168188 0.2381872 0.07123983
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5719251 0.9438503 0.2 0.5719251
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.1 0.3703704 0.2272727
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.190668 0.2672469 0
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008740955 0.07777638
## label step_major step_minor label_minor bgn end
## 11 fit.models_1_RFE.X 2 9 earth 488.419 502.437
## 12 fit.models_1_All.X 3 0 setup 502.438 NA
## elapsed
## 11 14.018
## 12 NA
## label step_major step_minor label_minor bgn end
## 12 fit.models_1_All.X 3 0 setup 502.438 502.445
## 13 fit.models_1_All.X 3 1 glmnet 502.445 NA
## elapsed
## 12 0.007
## 13 NA
## [1] "fitting model: All.X.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.d20nexp,storage.fctr,D.ratio.wrds.stop.n.wrds.n,D.ratio.weight.sum.wrds.n,.rnorm,startprice.dgt2.is9,D.wrds.stop.n.log,D.weight.sum.stem.stop.Ratio,cellular.fctr,D.weight.post.stop.sum,D.weight.post.stem.sum,D.weight.sum,startprice.dgt1.is9,D.wrds.n.log,D.chrs.uppr.n.log,D.chrs.n.log,D.terms.post.stem.n.log,D.wrds.unq.n.log,prdl.descr.my.fctr,D.terms.post.stop.n.log,D.chrs.pnct11.n.log,startprice.dcm2.is9,color.fctr,D.chrs.pnct13.n.log,startprice.dcm1.is9,condition.fctr,sprice.log10,sprice.root2,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 83 -none- numeric
## beta 11537 dgCMatrix S4
## df 83 -none- numeric
## dim 2 -none- numeric
## lambda 83 -none- numeric
## dev.ratio 83 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 139 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -0.8271081607
## D.chrs.pnct11.n.log
## -0.0103224923
## D.terms.post.stem.n.log
## -0.0069752942
## D.terms.post.stop.n.log
## -0.0168454394
## D.wrds.unq.n.log
## -0.0009227177
## prdl.descr.my.fctriPad4#0
## 0.0576454262
## prdl.descr.my.fctriPadAir#1
## -0.1352704587
## spdiff.cut.fctr(-10,-1]
## 1.0532076919
## spdiff.cut.fctr(-1,0]
## 2.0774440992
## spdiff.cut.fctr(1,10]
## 0.4321274474
## spdiff.cut.fctr(10,100]
## 0.3388087876
## sprice.log10
## -0.0549995002
## sprice.root2
## -0.0292119313
## cellular.fctr1:carrier.fctrUnknown
## -0.0844204922
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## 1.7146623304
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## 0.8074756802
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -0.729511548
## D.chrs.pnct11.n.log
## -0.046380971
## D.terms.post.stem.n.log
## -0.010376680
## D.terms.post.stop.n.log
## -0.020906742
## D.wrds.unq.n.log
## -0.005677139
## prdl.descr.my.fctriPad4#0
## 0.130953904
## prdl.descr.my.fctriPadAir#1
## -0.184144655
## spdiff.cut.fctr(-10,-1]
## 1.111012644
## spdiff.cut.fctr(-1,0]
## 2.223218543
## spdiff.cut.fctr(1,10]
## 0.493784771
## spdiff.cut.fctr(10,100]
## 0.384272149
## spdiff.cut.fctr(100,1e+03]
## -0.049109520
## sprice.log10
## -0.076060913
## sprice.root2
## -0.028474574
## cellular.fctr1:carrier.fctrUnknown
## -0.153836298
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## 1.942581599
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## 1.002825431
## sold.fctr sold.fctr.predict.All.X.glmnet.N
## 1 N 297
## 2 Y 36
## sold.fctr.predict.All.X.glmnet.Y
## 1 127
## 2 71
## Prediction
## Reference N Y
## N 297 127
## Y 36 71
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.930320e-01 2.762098e-01 6.518637e-01 7.320359e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.797632e-12
## sold.fctr sold.fctr.predict.All.X.glmnet.N
## 1 N 248
## 2 Y 36
## sold.fctr.predict.All.X.glmnet.Y
## 1 126
## 2 74
## Prediction
## Reference N Y
## N 248 126
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.652893e-01 2.605809e-01 6.213030e-01 7.072412e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 2.700298e-12
## id
## 1 All.X.glmnet
## feats
## 1 spdiff.cut.fctr,sprice.d20nexp,storage.fctr,D.ratio.wrds.stop.n.wrds.n,D.ratio.weight.sum.wrds.n,.rnorm,startprice.dgt2.is9,D.wrds.stop.n.log,D.weight.sum.stem.stop.Ratio,cellular.fctr,D.weight.post.stop.sum,D.weight.post.stem.sum,D.weight.sum,startprice.dgt1.is9,D.wrds.n.log,D.chrs.uppr.n.log,D.chrs.n.log,D.terms.post.stem.n.log,D.wrds.unq.n.log,prdl.descr.my.fctr,D.terms.post.stop.n.log,D.chrs.pnct11.n.log,startprice.dcm2.is9,color.fctr,D.chrs.pnct13.n.log,startprice.dcm1.is9,condition.fctr,sprice.log10,sprice.root2,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 21.348 1.014
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5373832 1 0.07476636 0.7644375
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4655738 0.8041793
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6518637 0.7320359 0.08122606
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5406417 0.9812834 0.1 0.7120929
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4774194 0.6652893
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.621303 0.7072412 0.2605809
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01319367 0.05458352
## label step_major step_minor label_minor bgn
## 13 fit.models_1_All.X 3 1 glmnet 502.445
## 14 fit.models_1_Best.Interact 4 0 setup 527.687
## end elapsed
## 13 527.686 25.241
## 14 NA NA
## label step_major step_minor
## 14 fit.models_1_Best.Interact 4 0
## 15 fit.models_1_Max.cor.Y.rcv.1X1.Interact 4 1
## label_minor bgn end elapsed
## 14 setup 527.687 527.714 0.027
## 15 glmnet 527.715 NA NA
## [1] "fitting model: Max.cor.Y.rcv.1X1.Interact.glmnet"
## [1] " indep_vars: sprice.root2,spdiff.cut.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 74 -none- numeric
## beta 592 dgCMatrix S4
## df 74 -none- numeric
## dim 2 -none- numeric
## lambda 74 -none- numeric
## dev.ratio 74 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.03165954 1.06135700 2.07411021
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.43394590 0.34905684 -0.03670655
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.03888845 1.13028806 2.22933838
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.51041350 0.41018734 -0.03819162
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.Interact.glmnet.N
## 1 N 306
## 2 Y 39
## sold.fctr.predict.Max.cor.Y.rcv.1X1.Interact.glmnet.Y
## 1 118
## 2 68
## Prediction
## Reference N Y
## N 306 118
## Y 39 68
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.043315e-01 2.799471e-01 6.635135e-01 7.428574e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999999e-01 4.813174e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.Interact.glmnet.N
## 1 N 254
## 2 Y 36
## sold.fctr.predict.Max.cor.Y.rcv.1X1.Interact.glmnet.Y
## 1 120
## 2 74
## Prediction
## Reference N Y
## N 254 120
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.776860e-01 2.771693e-01 6.340242e-01 7.191601e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999994e-01 3.025623e-11
## id feats
## 1 Max.cor.Y.rcv.1X1.Interact.glmnet sprice.root2,spdiff.cut.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.23 0.025
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5233645 1 0.04672897 0.7289279
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4641638 0.8110598
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6635135 0.7428574 0.09445917
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5187166 0.9919786 0.04545455 0.7148517
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4868421 0.677686
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6340242 0.7191601 0.2771693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008672284 0.06210202
# Re-fit the currently selected model once per alternative preProcess method
# to check whether any of them improves model performance.
#
# The selected model is the first row of glb_models_df after ordering by the
# model selection formula. Its id is split on "." : the last token is passed
# on as train.method, the remaining tokens form the id prefix of the new fits.
mdl_id <- orderBy(get_model_sel_frmla(), glb_models_df)[1, "id"]
method <- tail(unlist(strsplit(mdl_id, ".", fixed=TRUE)), 1)
mdl_id_pfx <- paste0(head(unlist(strsplit(mdl_id, ".", fixed=TRUE)), -1),
                     collapse=".")

# Features of the selected model: stored as one comma-separated string in the
# "feats" column, split here and stripped of surrounding whitespace.
indep_vars_vctr <- trim(unlist(strsplit(
    glb_models_df[glb_models_df$id == mdl_id, "feats"], ",", fixed=TRUE)))

for (prePr in glb_preproc_methods) {
    # Order in which the preProcess operations are applied:
    #   Box-Cox/Yeo-Johnson transformation, centering, scaling, range,
    #   imputation, PCA, ICA then spatial sign.
    # Only prePr varies between iterations; ret_lst keeps the last fit's result.
    ret_lst <- myfit_mdl(
        mdl_specs_lst=myinit_mdl_specs_lst(mdl_specs_lst=list(
            id.prefix=mdl_id_pfx,
            type=glb_model_type, tune.df=glb_tune_models_df,
            trainControl.method="repeatedcv",
            trainControl.number=glb_rcv_n_folds,
            trainControl.repeats=glb_rcv_n_repeats,
            trainControl.classProbs = glb_is_classification,
            trainControl.summaryFunction = glbMdlMetricSummaryFn,
            train.metric = glbMdlMetricSummary,
            train.maximize = glbMdlMetricMaximize,
            train.method=method, train.preProcess=prePr)),
        indep_vars=indep_vars_vctr, rsp_var=glb_rsp_var,
        fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df)
}
## [1] "fitting model: Max.cor.Y.rcv.1X1.YeoJohnson.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 74 -none- numeric
## beta 592 dgCMatrix S4
## df 74 -none- numeric
## dim 2 -none- numeric
## lambda 74 -none- numeric
## dev.ratio 74 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.01966308 1.06164853 2.07400270
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.43410103 0.34954414 -0.03968527
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.02640749 1.13059429 2.22923357
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.51057693 0.41069757 -0.04129085
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.YeoJohnson.glmnet.N
## 1 N 306
## 2 Y 39
## sold.fctr.predict.Max.cor.Y.rcv.1X1.YeoJohnson.glmnet.Y
## 1 118
## 2 68
## Prediction
## Reference N Y
## N 306 118
## Y 39 68
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.043315e-01 2.799471e-01 6.635135e-01 7.428574e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999999e-01 4.813174e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.YeoJohnson.glmnet.N
## 1 N 254
## 2 Y 36
## sold.fctr.predict.Max.cor.Y.rcv.1X1.YeoJohnson.glmnet.Y
## 1 120
## 2 74
## Prediction
## Reference N Y
## N 254 120
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.776860e-01 2.771693e-01 6.340242e-01 7.191601e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999994e-01 3.025623e-11
## id feats
## 1 Max.cor.Y.rcv.1X1.YeoJohnson.glmnet spdiff.cut.fctr,sprice.root2
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.58 0.049
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5233645 1 0.04672897 0.7288177
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4641638 0.8110598
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6635135 0.7428574 0.09445917
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5187166 0.9919786 0.04545455 0.7150705
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4868421 0.677686
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6340242 0.7191601 0.2771693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008672284 0.06210202
## [1] "fitting model: Max.cor.Y.rcv.1X1.center.scale.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 74 -none- numeric
## beta 592 dgCMatrix S4
## df 74 -none- numeric
## dim 2 -none- numeric
## lambda 74 -none- numeric
## dev.ratio 74 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.4315777 0.2867183 0.2005047
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.1105897 0.1355401 -0.1879626
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.4394786 0.3053395 0.2155106
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.1300772 0.1592774 -0.1955672
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.center.scale.glmnet.N
## 1 N 306
## 2 Y 39
## sold.fctr.predict.Max.cor.Y.rcv.1X1.center.scale.glmnet.Y
## 1 118
## 2 68
## Prediction
## Reference N Y
## N 306 118
## Y 39 68
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.043315e-01 2.799471e-01 6.635135e-01 7.428574e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999999e-01 4.813174e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.center.scale.glmnet.N
## 1 N 254
## 2 Y 36
## sold.fctr.predict.Max.cor.Y.rcv.1X1.center.scale.glmnet.Y
## 1 120
## 2 74
## Prediction
## Reference N Y
## N 254 120
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.776860e-01 2.771693e-01 6.340242e-01 7.191601e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999994e-01 3.025623e-11
## id feats
## 1 Max.cor.Y.rcv.1X1.center.scale.glmnet spdiff.cut.fctr,sprice.root2
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.308 0.031
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5233645 1 0.04672897 0.7289279
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4641638 0.8110598
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6635135 0.7428574 0.09445917
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5187166 0.9919786 0.04545455 0.7148517
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4868421 0.677686
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6340242 0.7191601 0.2771693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008672284 0.06210202
## [1] "fitting model: Max.cor.Y.rcv.1X1.range.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.0377 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 74 -none- numeric
## beta 592 dgCMatrix S4
## df 74 -none- numeric
## dim 2 -none- numeric
## lambda 74 -none- numeric
## dev.ratio 74 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.1049808 1.0613570 2.0741102
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.4339459 0.3490568 -1.0574430
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.1151762 1.1302881 2.2293384
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.5104135 0.4101873 -1.1002250
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.range.glmnet.N
## 1 N 306
## 2 Y 39
## sold.fctr.predict.Max.cor.Y.rcv.1X1.range.glmnet.Y
## 1 118
## 2 68
## Prediction
## Reference N Y
## N 306 118
## Y 39 68
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.043315e-01 2.799471e-01 6.635135e-01 7.428574e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 9.999999e-01 4.813174e-10
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.range.glmnet.N
## 1 N 254
## 2 Y 36
## sold.fctr.predict.Max.cor.Y.rcv.1X1.range.glmnet.Y
## 1 120
## 2 74
## Prediction
## Reference N Y
## N 254 120
## Y 36 74
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.776860e-01 2.771693e-01 6.340242e-01 7.191601e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 9.999994e-01 3.025623e-11
## id feats
## 1 Max.cor.Y.rcv.1X1.range.glmnet spdiff.cut.fctr,sprice.root2
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.321 0.029
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5233645 1 0.04672897 0.7289279
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4641638 0.8110598
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6635135 0.7428574 0.09445917
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5187166 0.9919786 0.04545455 0.7148517
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4868421 0.677686
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6340242 0.7191601 0.2771693
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.008672284 0.06210202
## [1] "fitting model: Max.cor.Y.rcv.1X1.pca.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.775, lambda = 0.00166 on full training set
## Length Class Mode
## a0 53 -none- numeric
## beta 371 dgCMatrix S4
## df 53 -none- numeric
## dim 2 -none- numeric
## lambda 53 -none- numeric
## dev.ratio 53 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 7 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) PC1 PC2 PC3 PC4 PC5
## -1.58463602 0.36334003 -0.04802493 0.34101951 0.15957914 0.49691185
## PC6 PC7
## 0.04329326 -0.61140183
## [1] "max lambda < lambdaOpt:"
## (Intercept) PC1 PC2 PC3 PC4 PC5
## -1.58563929 0.36372677 -0.04911213 0.34226246 0.16096850 0.50036988
## PC6 PC7
## 0.04451447 -0.61330850
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.pca.glmnet.N
## 1 N 299
## 2 Y 39
## sold.fctr.predict.Max.cor.Y.rcv.1X1.pca.glmnet.Y
## 1 125
## 2 68
## Prediction
## Reference N Y
## N 299 125
## Y 39 68
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.911488e-01 2.619875e-01 6.499247e-01 7.302296e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 3.192972e-11
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.pca.glmnet.N
## 1 N 241
## 2 Y 37
## sold.fctr.predict.Max.cor.Y.rcv.1X1.pca.glmnet.Y
## 1 133
## 2 73
## Prediction
## Reference N Y
## N 241 133
## Y 37 73
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.487603e-01 2.354865e-01 6.043915e-01 6.912989e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 3.188967e-13
## id feats
## 1 Max.cor.Y.rcv.1X1.pca.glmnet spdiff.cut.fctr,sprice.root2
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 20 2.093 0.02
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5723638 0.9858491 0.1588785 0.7314407
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4533333 0.8114577
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6499247 0.7302296 0.1854921
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5529412 0.9786096 0.1272727 0.7029655
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4620253 0.6487603
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6043915 0.6912989 0.2354865
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01482719 0.08399142
## [1] "fitting model: Max.cor.Y.rcv.1X1.ica.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.1, lambda = 0.0324 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 58 -none- numeric
## beta 174 dgCMatrix S4
## df 58 -none- numeric
## dim 2 -none- numeric
## lambda 58 -none- numeric
## dev.ratio 58 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 3 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) ICA1 ICA2 ICA3
## -1.4465092 -0.1090801 0.3788434 0.2936373
## [1] "max lambda < lambdaOpt:"
## (Intercept) ICA1 ICA2 ICA3
## -1.4491494 -0.1125012 0.3867320 0.2980896
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.ica.glmnet.N
## 1 N 281
## 2 Y 37
## sold.fctr.predict.Max.cor.Y.rcv.1X1.ica.glmnet.Y
## 1 143
## 2 70
## Prediction
## Reference N Y
## N 281 143
## Y 37 70
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.610169e-01 2.312889e-01 6.190023e-01 7.012275e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 5.026856e-15
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.ica.glmnet.N
## 1 N 236
## 2 Y 30
## sold.fctr.predict.Max.cor.Y.rcv.1X1.ica.glmnet.Y
## 1 138
## 2 80
## Prediction
## Reference N Y
## N 236 138
## Y 30 80
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.528926e-01 2.660842e-01 6.086141e-01 6.952898e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.516110e-16
## id feats
## 1 Max.cor.Y.rcv.1X1.ica.glmnet spdiff.cut.fctr,sprice.root2
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 20 2.27 0.021
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.6593414
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4375 0.8010051
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6190023 0.7012275 0.01920281
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5 1 0 0.6876276
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.4878049 0.6528926
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6086141 0.6952898 0.2660842
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.003634218 0.03126535
## [1] "fitting model: Max.cor.Y.rcv.1X1.spatialSign.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.root2"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.0434 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: alpha
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 77 -none- numeric
## beta 616 dgCMatrix S4
## df 77 -none- numeric
## dim 2 -none- numeric
## lambda 77 -none- numeric
## dev.ratio 77 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.2414931 0.8770002 1.8836996
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.2549696 0.2180840 -0.1385209
## [1] "max lambda < lambdaOpt:"
## (Intercept) spdiff.cut.fctr(-10,-1] spdiff.cut.fctr(-1,0]
## -1.2321881 0.9253330 2.0138734
## spdiff.cut.fctr(1,10] spdiff.cut.fctr(10,100] sprice.root2
## 0.3156959 0.2643316 -0.1689024
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.spatialSign.glmnet.N
## 1 N 286
## 2 Y 34
## sold.fctr.predict.Max.cor.Y.rcv.1X1.spatialSign.glmnet.Y
## 1 138
## 2 73
## Prediction
## Reference N Y
## N 286 138
## Y 34 73
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.760829e-01 2.616892e-01 6.344401e-01 7.157520e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 4.040160e-15
## sold.fctr sold.fctr.predict.Max.cor.Y.rcv.1X1.spatialSign.glmnet.N
## 1 N 236
## 2 Y 27
## sold.fctr.predict.Max.cor.Y.rcv.1X1.spatialSign.glmnet.Y
## 1 138
## 2 83
## Prediction
## Reference N Y
## N 236 138
## Y 27 83
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.590909e-01 2.843060e-01 6.149546e-01 7.012695e-01 7.727273e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.095029e-17
## id feats
## 1 Max.cor.Y.rcv.1X1.spatialSign.glmnet spdiff.cut.fctr,sprice.root2
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.475 0.041
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5233645 1 0.04672897 0.736885
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.4591195 0.8079246
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6344401 0.715752 0.07213928
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.5187166 0.9919786 0.04545455 0.7264706
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.2 0.5015106 0.6590909
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6149546 0.7012695 0.284306
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.00639014 0.04269829
# If (All|RFE).X.glm is less accurate than Low.cor.X.glm
# check NA coefficients & filter appropriate terms in indep_vars_vctr
# if (method == "glm") {
# orig_glm <- glb_models_lst[[paste0(mdl_id, ".", model_method)]]$finalModel
# orig_glm <- glb_models_lst[["All.X.glm"]]$finalModel; print(summary(orig_glm))
# orig_glm <- glb_models_lst[["RFE.X.glm"]]$finalModel; print(summary(orig_glm))
# require(car)
# vif_orig_glm <- vif(orig_glm); print(vif_orig_glm)
# # if vif errors out with "there are aliased coefficients in the model"
# alias_orig_glm <- alias(orig_glm); alias_complete_orig_glm <- (alias_orig_glm$Complete > 0); alias_complete_orig_glm <- alias_complete_orig_glm[rowSums(alias_complete_orig_glm) > 0, colSums(alias_complete_orig_glm) > 0]; print(alias_complete_orig_glm)
# print(vif_orig_glm[!is.na(vif_orig_glm) & (vif_orig_glm == Inf)])
# print(which.max(vif_orig_glm))
# print(sort(vif_orig_glm[vif_orig_glm >= 1.0e+03], decreasing=TRUE))
# glb_fitobs_df[c(1143, 3637, 3953, 4105), c("UniqueID", "Popular", "H.P.quandary", "Headline")]
# glb_feats_df[glb_feats_df$id %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE) | glb_feats_df$cor.high.X %in% grep("[HSA]\\.chrs.n.log", glb_feats_df$id, value=TRUE), ]
# all.equal(glb_allobs_df$S.chrs.uppr.n.log, glb_allobs_df$A.chrs.uppr.n.log)
# cor(glb_allobs_df$S.T.herald, glb_allobs_df$S.T.tribun)
# mydsp_obs(Abstract.contains="[Dd]iar", cols=("Abstract"), all=TRUE)
# subset(glb_feats_df, cor.y.abs <= glb_feats_df[glb_feats_df$id == ".rnorm", "cor.y.abs"])
# corxx_mtrx <- cor(data.matrix(glb_allobs_df[, setdiff(names(glb_allobs_df), myfind_chr_cols_df(glb_allobs_df))]), use="pairwise.complete.obs"); abs_corxx_mtrx <- abs(corxx_mtrx); diag(abs_corxx_mtrx) <- 0
# which.max(abs_corxx_mtrx["S.T.tribun", ])
# abs_corxx_mtrx["A.npnct08.log", "S.npnct08.log"]
# step_glm <- step(orig_glm)
# }
# Since caret does not optimize rpart well
# if (method == "rpart")
# ret_lst <- myfit_mdl(mdl_id=paste0(mdl_id_pfx, ".cp.0"), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=0, tune_models_df=data.frame(parameter="cp", min=0.0, max=0.0, by=0.1))
# User specified
# Ensure at least 2 vars in each regression; else varImp crashes
# sav_models_lst <- glb_models_lst; sav_models_df <- glb_models_df; sav_featsimp_df <- glb_featsimp_df; all.equal(sav_featsimp_df, glb_featsimp_df)
# glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df; glb_featsimp_df <- sav_featsimp_df
# easier to exclude features
# require(gdata) # needed for trim
# mdl_id <- "";
# indep_vars_vctr <- head(subset(glb_models_df, grepl("All\\.X\\.", mdl_id), select=feats)
# , 1)[, "feats"]
# indep_vars_vctr <- trim(unlist(strsplit(indep_vars_vctr, "[,]")))
# indep_vars_vctr <- setdiff(indep_vars_vctr, ".rnorm")
# easier to include features
#stop("here"); sav_models_df <- glb_models_df; glb_models_df <- sav_models_df
# !_sp
# mdl_id <- "csm"; indep_vars_vctr <- c(NULL
# ,"prdline.my.fctr", "prdline.my.fctr:.clusterid.fctr"
# ,"prdline.my.fctr*biddable"
# #,"prdline.my.fctr*startprice.log"
# #,"prdline.my.fctr*startprice.diff"
# ,"prdline.my.fctr*condition.fctr"
# ,"prdline.my.fctr*D.terms.post.stop.n"
# #,"prdline.my.fctr*D.terms.post.stem.n"
# ,"prdline.my.fctr*cellular.fctr"
# # ,"<feat1>:<feat2>"
# )
# for (method in glbMdlMethods) {
# ret_lst <- myfit_mdl(mdl_id=mdl_id, model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# model_type=glb_model_type,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glb_tune_models_df)
# csm_mdl_id <- paste0(mdl_id, ".", method)
# csm_featsimp_df <- myget_feats_importance(glb_models_lst[[paste0(mdl_id, ".",
# method)]]); print(head(csm_featsimp_df))
# }
###
# Ntv.1.lm <- lm(reformulate(indep_vars_vctr, glb_rsp_var), glb_trnobs_df); print(summary(Ntv.1.lm))
#glb_models_df[, "max.Accuracy.OOB", FALSE]
#varImp(glb_models_lst[["Low.cor.X.glm"]])
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.2.glm"]])$importance)
#orderBy(~ -Overall, varImp(glb_models_lst[["All.X.3.glm"]])$importance)
#glb_feats_df[grepl("npnct28", glb_feats_df$id), ]
# User specified bivariate models
# indep_vars_vctr_lst <- list()
# for (feat in setdiff(names(glb_fitobs_df),
# union(glb_rsp_var, glbFeatsExclude)))
# indep_vars_vctr_lst[[feat]] <- feat
# User specified combinatorial models
# indep_vars_vctr_lst <- list()
# combn_mtrx <- combn(c("<feat1_name>", "<feat2_name>", "<featn_name>"),
# <num_feats_to_choose>)
# for (combn_ix in 1:ncol(combn_mtrx))
# #print(combn_mtrx[, combn_ix])
# indep_vars_vctr_lst[[combn_ix]] <- combn_mtrx[, combn_ix]
# template for myfit_mdl
# rf is hard-coded in caret to recognize only Accuracy / Kappa evaluation metrics
# only for OOB in trainControl ?
# ret_lst <- myfit_mdl_fn(mdl_id=paste0(mdl_id_pfx, ""), model_method=method,
# indep_vars_vctr=indep_vars_vctr,
# rsp_var=glb_rsp_var, rsp_var_out=glb_rsp_var_out,
# fit_df=glb_fitobs_df, OOB_df=glb_OOBobs_df,
# n_cv_folds=glb_rcv_n_folds, tune_models_df=glb_tune_models_df,
# model_loss_mtrx=glbMdlMetric_terms,
# model_summaryFunction=glbMdlMetricSummaryFn,
# model_metric=glbMdlMetricSummary,
# model_metric_maximize=glbMdlMetricMaximize)
# Simplify a model
# fit_df <- glb_fitobs_df; glb_mdl <- step(<complex>_mdl)
# Non-caret models
# rpart_area_mdl <- rpart(reformulate("Area", response=glb_rsp_var),
# data=glb_fitobs_df, #method="class",
# control=rpart.control(cp=0.12),
# parms=list(loss=glbMdlMetric_terms))
# print("rpart_sel_wlm_mdl"); prp(rpart_sel_wlm_mdl)
#
# Show the full model-comparison table (one row per fitted model id) accumulated so far.
print(glb_models_df)
## id
## MFO.myMFO_classfr MFO.myMFO_classfr
## Random.myrandom_classfr Random.myrandom_classfr
## Max.cor.Y.rcv.1X1.glmnet Max.cor.Y.rcv.1X1.glmnet
## Max.cor.Y.rcv.3X1.glmnet Max.cor.Y.rcv.3X1.glmnet
## Max.cor.Y.rcv.3X3.glmnet Max.cor.Y.rcv.3X3.glmnet
## Max.cor.Y.rcv.3X5.glmnet Max.cor.Y.rcv.3X5.glmnet
## Max.cor.Y.rcv.5X1.glmnet Max.cor.Y.rcv.5X1.glmnet
## Max.cor.Y.rcv.5X3.glmnet Max.cor.Y.rcv.5X3.glmnet
## Max.cor.Y.rcv.5X5.glmnet Max.cor.Y.rcv.5X5.glmnet
## Max.cor.Y.rcv.1X1.cp.0.rpart Max.cor.Y.rcv.1X1.cp.0.rpart
## Max.cor.Y.rpart Max.cor.Y.rpart
## Interact.High.cor.Y.glmnet Interact.High.cor.Y.glmnet
## Low.cor.X.glmnet Low.cor.X.glmnet
## RFE.X.glm RFE.X.glm
## RFE.X.bayesglm RFE.X.bayesglm
## RFE.X.glmnet RFE.X.glmnet
## RFE.X.rpart RFE.X.rpart
## RFE.X.gbm RFE.X.gbm
## RFE.X.rf RFE.X.rf
## RFE.X.nnet RFE.X.nnet
## RFE.X.avNNet RFE.X.avNNet
## RFE.X.earth RFE.X.earth
## All.X.glmnet All.X.glmnet
## Max.cor.Y.rcv.1X1.Interact.glmnet Max.cor.Y.rcv.1X1.Interact.glmnet
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet Max.cor.Y.rcv.1X1.YeoJohnson.glmnet
## Max.cor.Y.rcv.1X1.center.scale.glmnet Max.cor.Y.rcv.1X1.center.scale.glmnet
## Max.cor.Y.rcv.1X1.range.glmnet Max.cor.Y.rcv.1X1.range.glmnet
## Max.cor.Y.rcv.1X1.pca.glmnet Max.cor.Y.rcv.1X1.pca.glmnet
## Max.cor.Y.rcv.1X1.ica.glmnet Max.cor.Y.rcv.1X1.ica.glmnet
## Max.cor.Y.rcv.1X1.spatialSign.glmnet Max.cor.Y.rcv.1X1.spatialSign.glmnet
## feats
## MFO.myMFO_classfr .rnorm
## Random.myrandom_classfr .rnorm
## Max.cor.Y.rcv.1X1.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.3X1.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.3X3.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.3X5.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.5X1.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.5X3.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.5X5.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.cp.0.rpart spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rpart spdiff.cut.fctr,sprice.root2
## Interact.High.cor.Y.glmnet spdiff.cut.fctr,sprice.root2,spdiff.cut.fctr:D.terms.post.stop.n.log,spdiff.cut.fctr:D.weight.post.stem.sum,spdiff.cut.fctr:D.chrs.n.log,spdiff.cut.fctr:D.terms.post.stem.n.log,spdiff.cut.fctr:startprice.dcm1.is9,spdiff.cut.fctr:sprice.root2
## Low.cor.X.glmnet spdiff.cut.fctr,sprice.d20nexp,storage.fctr,D.ratio.weight.sum.wrds.n,.rnorm,startprice.dgt2.is9,D.wrds.stop.n.log,D.weight.sum.stem.stop.Ratio,cellular.fctr,startprice.dgt1.is9,prdl.descr.my.fctr,D.terms.post.stop.n.log,D.chrs.pnct11.n.log,color.fctr,D.chrs.pnct13.n.log,startprice.dcm1.is9,condition.fctr,sprice.root2,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.glm spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.bayesglm spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.glmnet spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.rpart spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.gbm spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.rf spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.nnet spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.avNNet spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.earth spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## All.X.glmnet spdiff.cut.fctr,sprice.d20nexp,storage.fctr,D.ratio.wrds.stop.n.wrds.n,D.ratio.weight.sum.wrds.n,.rnorm,startprice.dgt2.is9,D.wrds.stop.n.log,D.weight.sum.stem.stop.Ratio,cellular.fctr,D.weight.post.stop.sum,D.weight.post.stem.sum,D.weight.sum,startprice.dgt1.is9,D.wrds.n.log,D.chrs.uppr.n.log,D.chrs.n.log,D.terms.post.stem.n.log,D.wrds.unq.n.log,prdl.descr.my.fctr,D.terms.post.stop.n.log,D.chrs.pnct11.n.log,startprice.dcm2.is9,color.fctr,D.chrs.pnct13.n.log,startprice.dcm1.is9,condition.fctr,sprice.log10,sprice.root2,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## Max.cor.Y.rcv.1X1.Interact.glmnet sprice.root2,spdiff.cut.fctr
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.center.scale.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.range.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.pca.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.ica.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.spatialSign.glmnet spdiff.cut.fctr,sprice.root2
## max.nTuningRuns
## MFO.myMFO_classfr 0
## Random.myrandom_classfr 0
## Max.cor.Y.rcv.1X1.glmnet 0
## Max.cor.Y.rcv.3X1.glmnet 25
## Max.cor.Y.rcv.3X3.glmnet 25
## Max.cor.Y.rcv.3X5.glmnet 25
## Max.cor.Y.rcv.5X1.glmnet 25
## Max.cor.Y.rcv.5X3.glmnet 25
## Max.cor.Y.rcv.5X5.glmnet 25
## Max.cor.Y.rcv.1X1.cp.0.rpart 0
## Max.cor.Y.rpart 5
## Interact.High.cor.Y.glmnet 25
## Low.cor.X.glmnet 25
## RFE.X.glm 1
## RFE.X.bayesglm 1
## RFE.X.glmnet 25
## RFE.X.rpart 5
## RFE.X.gbm 25
## RFE.X.rf 5
## RFE.X.nnet 25
## RFE.X.avNNet 25
## RFE.X.earth 5
## All.X.glmnet 25
## Max.cor.Y.rcv.1X1.Interact.glmnet 25
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 25
## Max.cor.Y.rcv.1X1.center.scale.glmnet 25
## Max.cor.Y.rcv.1X1.range.glmnet 25
## Max.cor.Y.rcv.1X1.pca.glmnet 20
## Max.cor.Y.rcv.1X1.ica.glmnet 20
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 25
## min.elapsedtime.everything
## MFO.myMFO_classfr 0.277
## Random.myrandom_classfr 0.287
## Max.cor.Y.rcv.1X1.glmnet 0.722
## Max.cor.Y.rcv.3X1.glmnet 1.469
## Max.cor.Y.rcv.3X3.glmnet 2.097
## Max.cor.Y.rcv.3X5.glmnet 2.487
## Max.cor.Y.rcv.5X1.glmnet 1.685
## Max.cor.Y.rcv.5X3.glmnet 2.469
## Max.cor.Y.rcv.5X5.glmnet 3.077
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.693
## Max.cor.Y.rpart 1.526
## Interact.High.cor.Y.glmnet 11.792
## Low.cor.X.glmnet 9.019
## RFE.X.glm 3.452
## RFE.X.bayesglm 6.118
## RFE.X.glmnet 21.603
## RFE.X.rpart 3.268
## RFE.X.gbm 8.257
## RFE.X.rf 6.895
## RFE.X.nnet 26.173
## RFE.X.avNNet 72.276
## RFE.X.earth 9.912
## All.X.glmnet 21.348
## Max.cor.Y.rcv.1X1.Interact.glmnet 2.230
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 2.580
## Max.cor.Y.rcv.1X1.center.scale.glmnet 2.308
## Max.cor.Y.rcv.1X1.range.glmnet 2.321
## Max.cor.Y.rcv.1X1.pca.glmnet 2.093
## Max.cor.Y.rcv.1X1.ica.glmnet 2.270
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 2.475
## min.elapsedtime.final
## MFO.myMFO_classfr 0.003
## Random.myrandom_classfr 0.002
## Max.cor.Y.rcv.1X1.glmnet 0.032
## Max.cor.Y.rcv.3X1.glmnet 0.026
## Max.cor.Y.rcv.3X3.glmnet 0.024
## Max.cor.Y.rcv.3X5.glmnet 0.026
## Max.cor.Y.rcv.5X1.glmnet 0.025
## Max.cor.Y.rcv.5X3.glmnet 0.024
## Max.cor.Y.rcv.5X5.glmnet 0.024
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.015
## Max.cor.Y.rpart 0.013
## Interact.High.cor.Y.glmnet 1.396
## Low.cor.X.glmnet 0.992
## RFE.X.glm 0.235
## RFE.X.bayesglm 0.468
## RFE.X.glmnet 0.972
## RFE.X.rpart 0.067
## RFE.X.gbm 0.260
## RFE.X.rf 1.225
## RFE.X.nnet 0.115
## RFE.X.avNNet 1.044
## RFE.X.earth 1.499
## All.X.glmnet 1.014
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.025
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.049
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.031
## Max.cor.Y.rcv.1X1.range.glmnet 0.029
## Max.cor.Y.rcv.1X1.pca.glmnet 0.020
## Max.cor.Y.rcv.1X1.ica.glmnet 0.021
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.041
## max.AUCpROC.fit max.Sens.fit
## MFO.myMFO_classfr 0.5000000 1.0000000
## Random.myrandom_classfr 0.5282909 0.8042453
## Max.cor.Y.rcv.1X1.glmnet 0.5653324 0.9811321
## Max.cor.Y.rcv.3X1.glmnet 0.5233645 1.0000000
## Max.cor.Y.rcv.3X3.glmnet 0.5233645 1.0000000
## Max.cor.Y.rcv.3X5.glmnet 0.5233645 1.0000000
## Max.cor.Y.rcv.5X1.glmnet 0.5536722 0.9858491
## Max.cor.Y.rcv.5X3.glmnet 0.5536722 0.9858491
## Max.cor.Y.rcv.5X5.glmnet 0.5536722 0.9858491
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.6667806 0.9410377
## Max.cor.Y.rpart 0.5000000 1.0000000
## Interact.High.cor.Y.glmnet 0.5233645 1.0000000
## Low.cor.X.glmnet 0.5373832 1.0000000
## RFE.X.glm 0.7474431 0.9528302
## RFE.X.bayesglm 0.7252579 0.9551887
## RFE.X.glmnet 0.5373832 1.0000000
## RFE.X.rpart 0.5000000 1.0000000
## RFE.X.gbm 0.5548514 0.9882075
## RFE.X.rf 0.5000000 1.0000000
## RFE.X.nnet 0.7867880 0.8726415
## RFE.X.avNNet 0.7522924 0.9905660
## RFE.X.earth 0.5733667 0.9504717
## All.X.glmnet 0.5373832 1.0000000
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.5233645 1.0000000
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.5233645 1.0000000
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.5233645 1.0000000
## Max.cor.Y.rcv.1X1.range.glmnet 0.5233645 1.0000000
## Max.cor.Y.rcv.1X1.pca.glmnet 0.5723638 0.9858491
## Max.cor.Y.rcv.1X1.ica.glmnet 0.5000000 1.0000000
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.5233645 1.0000000
## max.Spec.fit max.AUCROCR.fit
## MFO.myMFO_classfr 0.00000000 0.5000000
## Random.myrandom_classfr 0.25233645 0.5050586
## Max.cor.Y.rcv.1X1.glmnet 0.14953271 0.7580453
## Max.cor.Y.rcv.3X1.glmnet 0.04672897 0.7289279
## Max.cor.Y.rcv.3X3.glmnet 0.04672897 0.7289279
## Max.cor.Y.rcv.3X5.glmnet 0.04672897 0.7289279
## Max.cor.Y.rcv.5X1.glmnet 0.12149533 0.7585302
## Max.cor.Y.rcv.5X3.glmnet 0.12149533 0.7585302
## Max.cor.Y.rcv.5X5.glmnet 0.12149533 0.7585302
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.39252336 0.7336449
## Max.cor.Y.rpart 0.00000000 0.5000000
## Interact.High.cor.Y.glmnet 0.04672897 0.7465945
## Low.cor.X.glmnet 0.07476636 0.7637762
## RFE.X.glm 0.54205607 0.9143229
## RFE.X.bayesglm 0.49532710 0.8990698
## RFE.X.glmnet 0.07476636 0.7644375
## RFE.X.rpart 0.00000000 0.5000000
## RFE.X.gbm 0.12149533 0.7596544
## RFE.X.rf 0.00000000 0.8434800
## RFE.X.nnet 0.70093458 0.7898298
## RFE.X.avNNet 0.51401869 0.9486863
## RFE.X.earth 0.19626168 0.5733667
## All.X.glmnet 0.07476636 0.7644375
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.04672897 0.7289279
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.04672897 0.7288177
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.04672897 0.7289279
## Max.cor.Y.rcv.1X1.range.glmnet 0.04672897 0.7289279
## Max.cor.Y.rcv.1X1.pca.glmnet 0.15887850 0.7314407
## Max.cor.Y.rcv.1X1.ica.glmnet 0.00000000 0.6593414
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.04672897 0.7368850
## opt.prob.threshold.fit
## MFO.myMFO_classfr 0.2
## Random.myrandom_classfr 0.2
## Max.cor.Y.rcv.1X1.glmnet 0.2
## Max.cor.Y.rcv.3X1.glmnet 0.2
## Max.cor.Y.rcv.3X3.glmnet 0.2
## Max.cor.Y.rcv.3X5.glmnet 0.2
## Max.cor.Y.rcv.5X1.glmnet 0.2
## Max.cor.Y.rcv.5X3.glmnet 0.2
## Max.cor.Y.rcv.5X5.glmnet 0.2
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.3
## Max.cor.Y.rpart 0.2
## Interact.High.cor.Y.glmnet 0.2
## Low.cor.X.glmnet 0.2
## RFE.X.glm 0.3
## RFE.X.bayesglm 0.3
## RFE.X.glmnet 0.2
## RFE.X.rpart 0.2
## RFE.X.gbm 0.2
## RFE.X.rf 0.1
## RFE.X.nnet 0.4
## RFE.X.avNNet 0.3
## RFE.X.earth 0.1
## All.X.glmnet 0.2
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.2
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.2
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.2
## Max.cor.Y.rcv.1X1.range.glmnet 0.2
## Max.cor.Y.rcv.1X1.pca.glmnet 0.2
## Max.cor.Y.rcv.1X1.ica.glmnet 0.2
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.2
## max.f.score.fit max.Accuracy.fit
## MFO.myMFO_classfr 0.3354232 0.2015066
## Random.myrandom_classfr 0.3354232 0.2015066
## Max.cor.Y.rcv.1X1.glmnet 0.4778157 0.7118644
## Max.cor.Y.rcv.3X1.glmnet 0.4641638 0.8098207
## Max.cor.Y.rcv.3X3.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.3X5.glmnet 0.4641638 0.8098088
## Max.cor.Y.rcv.5X1.glmnet 0.4778157 0.8154583
## Max.cor.Y.rcv.5X3.glmnet 0.4778157 0.8123155
## Max.cor.Y.rcv.5X5.glmnet 0.4778157 0.8101741
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.5073171 0.8097928
## Max.cor.Y.rpart 0.3354232 0.7897092
## Interact.High.cor.Y.glmnet 0.4810127 0.8098079
## Low.cor.X.glmnet 0.4671053 0.8048071
## RFE.X.glm 0.6693548 0.7407370
## RFE.X.bayesglm 0.6778243 0.7815306
## RFE.X.glmnet 0.4655738 0.8054348
## RFE.X.rpart 0.3354232 0.7941175
## RFE.X.gbm 0.4951768 0.7978522
## RFE.X.rf 0.3787879 0.7984934
## RFE.X.nnet 0.6386555 0.8022536
## RFE.X.avNNet 0.7961165 0.8003774
## RFE.X.earth 0.3354232 0.7941069
## All.X.glmnet 0.4655738 0.8041793
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.1X1.range.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.1X1.pca.glmnet 0.4533333 0.8114577
## Max.cor.Y.rcv.1X1.ica.glmnet 0.4375000 0.8010051
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.4591195 0.8079246
## max.AccuracyLower.fit
## MFO.myMFO_classfr 0.1681880
## Random.myrandom_classfr 0.1681880
## Max.cor.Y.rcv.1X1.glmnet 0.6712958
## Max.cor.Y.rcv.3X1.glmnet 0.6635135
## Max.cor.Y.rcv.3X3.glmnet 0.6635135
## Max.cor.Y.rcv.3X5.glmnet 0.6635135
## Max.cor.Y.rcv.5X1.glmnet 0.6712958
## Max.cor.Y.rcv.5X3.glmnet 0.6712958
## Max.cor.Y.rcv.5X5.glmnet 0.6712958
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.7737799
## Max.cor.Y.rpart 0.1681880
## Interact.High.cor.Y.glmnet 0.6499247
## Low.cor.X.glmnet 0.6538034
## RFE.X.glm 0.8119850
## RFE.X.bayesglm 0.8221278
## RFE.X.glmnet 0.6518637
## RFE.X.rpart 0.1681880
## RFE.X.gbm 0.6635135
## RFE.X.rf 0.8119850
## RFE.X.nnet 0.8038993
## RFE.X.avNNet 0.8945873
## RFE.X.earth 0.1681880
## All.X.glmnet 0.6518637
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.6635135
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.6635135
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.6635135
## Max.cor.Y.rcv.1X1.range.glmnet 0.6635135
## Max.cor.Y.rcv.1X1.pca.glmnet 0.6499247
## Max.cor.Y.rcv.1X1.ica.glmnet 0.6190023
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.6344401
## max.AccuracyUpper.fit max.Kappa.fit
## MFO.myMFO_classfr 0.2381872 0.00000000
## Random.myrandom_classfr 0.2381872 0.00000000
## Max.cor.Y.rcv.1X1.glmnet 0.7500560 0.29829244
## Max.cor.Y.rcv.3X1.glmnet 0.7428574 0.08662750
## Max.cor.Y.rcv.3X3.glmnet 0.7428574 0.09445917
## Max.cor.Y.rcv.3X5.glmnet 0.7428574 0.08571096
## Max.cor.Y.rcv.5X1.glmnet 0.7500560 0.16272716
## Max.cor.Y.rcv.5X3.glmnet 0.7500560 0.14648283
## Max.cor.Y.rcv.5X5.glmnet 0.7500560 0.14504375
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.8423139 0.38974546
## Max.cor.Y.rpart 0.2381872 0.06035055
## Interact.High.cor.Y.glmnet 0.7302296 0.08548063
## Low.cor.X.glmnet 0.7338414 0.08253030
## RFE.X.glm 0.8752569 0.20258982
## RFE.X.bayesglm 0.8838361 0.21090324
## RFE.X.glmnet 0.7320359 0.08387615
## RFE.X.rpart 0.2381872 0.07284994
## RFE.X.gbm 0.7428574 0.07317897
## RFE.X.rf 0.8752569 0.00000000
## RFE.X.nnet 0.8683646 0.04244521
## RFE.X.avNNet 0.9424022 0.04399954
## RFE.X.earth 0.2381872 0.07123983
## All.X.glmnet 0.7320359 0.08122606
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.7428574 0.09445917
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.7428574 0.09445917
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.7428574 0.09445917
## Max.cor.Y.rcv.1X1.range.glmnet 0.7428574 0.09445917
## Max.cor.Y.rcv.1X1.pca.glmnet 0.7302296 0.18549208
## Max.cor.Y.rcv.1X1.ica.glmnet 0.7012275 0.01920281
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.7157520 0.07213928
## max.AUCpROC.OOB max.Sens.OOB
## MFO.myMFO_classfr 0.5000000 1.0000000
## Random.myrandom_classfr 0.5168449 0.8155080
## Max.cor.Y.rcv.1X1.glmnet 0.5529412 0.9786096
## Max.cor.Y.rcv.3X1.glmnet 0.5187166 0.9919786
## Max.cor.Y.rcv.3X3.glmnet 0.5187166 0.9919786
## Max.cor.Y.rcv.3X5.glmnet 0.5187166 0.9919786
## Max.cor.Y.rcv.5X1.glmnet 0.5497326 0.9812834
## Max.cor.Y.rcv.5X3.glmnet 0.5497326 0.9812834
## Max.cor.Y.rcv.5X5.glmnet 0.5497326 0.9812834
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.5914439 0.9010695
## Max.cor.Y.rpart 0.5000000 1.0000000
## Interact.High.cor.Y.glmnet 0.5187166 0.9919786
## Low.cor.X.glmnet 0.5406417 0.9812834
## RFE.X.glm 0.6304813 0.8609626
## RFE.X.bayesglm 0.6000000 0.9090909
## RFE.X.glmnet 0.5406417 0.9812834
## RFE.X.rpart 0.5000000 1.0000000
## RFE.X.gbm 0.5368984 0.9919786
## RFE.X.rf 0.5000000 1.0000000
## RFE.X.nnet 0.5810160 0.7620321
## RFE.X.avNNet 0.5810160 0.9438503
## RFE.X.earth 0.5719251 0.9438503
## All.X.glmnet 0.5406417 0.9812834
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.5187166 0.9919786
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.5187166 0.9919786
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.5187166 0.9919786
## Max.cor.Y.rcv.1X1.range.glmnet 0.5187166 0.9919786
## Max.cor.Y.rcv.1X1.pca.glmnet 0.5529412 0.9786096
## Max.cor.Y.rcv.1X1.ica.glmnet 0.5000000 1.0000000
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.5187166 0.9919786
## max.Spec.OOB max.AUCROCR.OOB
## MFO.myMFO_classfr 0.00000000 0.5000000
## Random.myrandom_classfr 0.21818182 0.4962567
## Max.cor.Y.rcv.1X1.glmnet 0.12727273 0.7423189
## Max.cor.Y.rcv.3X1.glmnet 0.04545455 0.7148517
## Max.cor.Y.rcv.3X3.glmnet 0.04545455 0.7148517
## Max.cor.Y.rcv.3X5.glmnet 0.04545455 0.7148517
## Max.cor.Y.rcv.5X1.glmnet 0.11818182 0.7398882
## Max.cor.Y.rcv.5X3.glmnet 0.11818182 0.7398882
## Max.cor.Y.rcv.5X5.glmnet 0.11818182 0.7398882
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.28181818 0.6280870
## Max.cor.Y.rpart 0.00000000 0.5000000
## Interact.High.cor.Y.glmnet 0.04545455 0.7259844
## Low.cor.X.glmnet 0.10000000 0.7107560
## RFE.X.glm 0.40000000 0.7157997
## RFE.X.bayesglm 0.29090909 0.7037555
## RFE.X.glmnet 0.10000000 0.7120929
## RFE.X.rpart 0.00000000 0.5000000
## RFE.X.gbm 0.08181818 0.6834346
## RFE.X.rf 0.00000000 0.6501823
## RFE.X.nnet 0.40000000 0.5818060
## RFE.X.avNNet 0.21818182 0.6544361
## RFE.X.earth 0.20000000 0.5719251
## All.X.glmnet 0.10000000 0.7120929
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.04545455 0.7148517
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.04545455 0.7150705
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.04545455 0.7148517
## Max.cor.Y.rcv.1X1.range.glmnet 0.04545455 0.7148517
## Max.cor.Y.rcv.1X1.pca.glmnet 0.12727273 0.7029655
## Max.cor.Y.rcv.1X1.ica.glmnet 0.00000000 0.6876276
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.04545455 0.7264706
## opt.prob.threshold.OOB
## MFO.myMFO_classfr 0.2
## Random.myrandom_classfr 0.2
## Max.cor.Y.rcv.1X1.glmnet 0.2
## Max.cor.Y.rcv.3X1.glmnet 0.2
## Max.cor.Y.rcv.3X3.glmnet 0.2
## Max.cor.Y.rcv.3X5.glmnet 0.2
## Max.cor.Y.rcv.5X1.glmnet 0.2
## Max.cor.Y.rcv.5X3.glmnet 0.2
## Max.cor.Y.rcv.5X5.glmnet 0.2
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.2
## Max.cor.Y.rpart 0.2
## Interact.High.cor.Y.glmnet 0.2
## Low.cor.X.glmnet 0.2
## RFE.X.glm 0.3
## RFE.X.bayesglm 0.3
## RFE.X.glmnet 0.2
## RFE.X.rpart 0.2
## RFE.X.gbm 0.2
## RFE.X.rf 0.0
## RFE.X.nnet 0.0
## RFE.X.avNNet 0.3
## RFE.X.earth 0.1
## All.X.glmnet 0.2
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.2
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.2
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.2
## Max.cor.Y.rcv.1X1.range.glmnet 0.2
## Max.cor.Y.rcv.1X1.pca.glmnet 0.2
## Max.cor.Y.rcv.1X1.ica.glmnet 0.2
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.2
## max.f.score.OOB max.Accuracy.OOB
## MFO.myMFO_classfr 0.3703704 0.2272727
## Random.myrandom_classfr 0.3703704 0.2272727
## Max.cor.Y.rcv.1X1.glmnet 0.5175719 0.6880165
## Max.cor.Y.rcv.3X1.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.3X3.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.3X5.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.5X1.glmnet 0.5047923 0.6797521
## Max.cor.Y.rcv.5X3.glmnet 0.5047923 0.6797521
## Max.cor.Y.rcv.5X5.glmnet 0.5047923 0.6797521
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.4380165 0.7190083
## Max.cor.Y.rpart 0.3703704 0.2272727
## Interact.High.cor.Y.glmnet 0.4863222 0.6508264
## Low.cor.X.glmnet 0.4789644 0.6673554
## RFE.X.glm 0.4800000 0.7314050
## RFE.X.bayesglm 0.4682540 0.7231405
## RFE.X.glmnet 0.4774194 0.6652893
## RFE.X.rpart 0.3703704 0.2272727
## RFE.X.gbm 0.4312500 0.6239669
## RFE.X.rf 0.3703704 0.2272727
## RFE.X.nnet 0.3703704 0.2272727
## RFE.X.avNNet 0.4131455 0.7417355
## RFE.X.earth 0.3703704 0.2272727
## All.X.glmnet 0.4774194 0.6652893
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.1X1.range.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.1X1.pca.glmnet 0.4620253 0.6487603
## Max.cor.Y.rcv.1X1.ica.glmnet 0.4878049 0.6528926
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.5015106 0.6590909
## max.AccuracyLower.OOB
## MFO.myMFO_classfr 0.1906680
## Random.myrandom_classfr 0.1906680
## Max.cor.Y.rcv.1X1.glmnet 0.6446506
## Max.cor.Y.rcv.3X1.glmnet 0.6340242
## Max.cor.Y.rcv.3X3.glmnet 0.6340242
## Max.cor.Y.rcv.3X5.glmnet 0.6340242
## Max.cor.Y.rcv.5X1.glmnet 0.6361476
## Max.cor.Y.rcv.5X3.glmnet 0.6361476
## Max.cor.Y.rcv.5X5.glmnet 0.6361476
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.6766747
## Max.cor.Y.rpart 0.1906680
## Interact.High.cor.Y.glmnet 0.6065024
## Low.cor.X.glmnet 0.6234210
## RFE.X.glm 0.6895485
## RFE.X.bayesglm 0.6809618
## RFE.X.glmnet 0.6213030
## RFE.X.rpart 0.1906680
## RFE.X.gbm 0.5791279
## RFE.X.rf 0.1906680
## RFE.X.nnet 0.1906680
## RFE.X.avNNet 0.7003064
## RFE.X.earth 0.1906680
## All.X.glmnet 0.6213030
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.6340242
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.6340242
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.6340242
## Max.cor.Y.rcv.1X1.range.glmnet 0.6340242
## Max.cor.Y.rcv.1X1.pca.glmnet 0.6043915
## Max.cor.Y.rcv.1X1.ica.glmnet 0.6086141
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.6149546
## max.AccuracyUpper.OOB max.Kappa.OOB
## MFO.myMFO_classfr 0.2672469 0.0000000
## Random.myrandom_classfr 0.2672469 0.0000000
## Max.cor.Y.rcv.1X1.glmnet 0.7290671 0.3158979
## Max.cor.Y.rcv.3X1.glmnet 0.7191601 0.2771693
## Max.cor.Y.rcv.3X3.glmnet 0.7191601 0.2771693
## Max.cor.Y.rcv.3X5.glmnet 0.7191601 0.2771693
## Max.cor.Y.rcv.5X1.glmnet 0.7211434 0.2977759
## Max.cor.Y.rcv.5X3.glmnet 0.7211434 0.2977759
## Max.cor.Y.rcv.5X5.glmnet 0.7211434 0.2977759
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.7586422 0.2527473
## Max.cor.Y.rpart 0.2672469 0.0000000
## Interact.High.cor.Y.glmnet 0.6932948 0.2634707
## Low.cor.X.glmnet 0.7092300 0.2633111
## RFE.X.glm 0.7704077 0.3024390
## RFE.X.bayesglm 0.7625683 0.2851600
## RFE.X.glmnet 0.7072412 0.2605809
## RFE.X.rpart 0.2672469 0.0000000
## RFE.X.gbm 0.6672814 0.1894737
## RFE.X.rf 0.2672469 0.0000000
## RFE.X.nnet 0.2672469 0.0000000
## RFE.X.avNNet 0.7801824 0.2478118
## RFE.X.earth 0.2672469 0.0000000
## All.X.glmnet 0.7072412 0.2605809
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.7191601 0.2771693
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.7191601 0.2771693
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.7191601 0.2771693
## Max.cor.Y.rcv.1X1.range.glmnet 0.7191601 0.2771693
## Max.cor.Y.rcv.1X1.pca.glmnet 0.6912989 0.2354865
## Max.cor.Y.rcv.1X1.ica.glmnet 0.6952898 0.2660842
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.7012695 0.2843060
## max.AccuracySD.fit max.KappaSD.fit
## MFO.myMFO_classfr NA NA
## Random.myrandom_classfr NA NA
## Max.cor.Y.rcv.1X1.glmnet NA NA
## Max.cor.Y.rcv.3X1.glmnet 0.007588213 0.04350633
## Max.cor.Y.rcv.3X3.glmnet 0.008672284 0.06210202
## Max.cor.Y.rcv.3X5.glmnet 0.007730242 0.05270539
## Max.cor.Y.rcv.5X1.glmnet 0.004163174 0.04402859
## Max.cor.Y.rcv.5X3.glmnet 0.013303535 0.07756812
## Max.cor.Y.rcv.5X5.glmnet 0.013669737 0.07197471
## Max.cor.Y.rcv.1X1.cp.0.rpart NA NA
## Max.cor.Y.rpart 0.014803443 0.06208502
## Interact.High.cor.Y.glmnet 0.008044233 0.05798146
## Low.cor.X.glmnet 0.012920083 0.05420123
## RFE.X.glm 0.038199446 0.07904272
## RFE.X.bayesglm 0.026156181 0.08763170
## RFE.X.glmnet 0.012918104 0.05410138
## RFE.X.rpart 0.012603421 0.08825496
## RFE.X.gbm 0.011472280 0.08088757
## RFE.X.rf NA NA
## RFE.X.nnet 0.008122698 0.05474545
## RFE.X.avNNet 0.005559387 0.05057841
## RFE.X.earth 0.008740955 0.07777638
## All.X.glmnet 0.013193666 0.05458352
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.008672284 0.06210202
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.008672284 0.06210202
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.008672284 0.06210202
## Max.cor.Y.rcv.1X1.range.glmnet 0.008672284 0.06210202
## Max.cor.Y.rcv.1X1.pca.glmnet 0.014827190 0.08399142
## Max.cor.Y.rcv.1X1.ica.glmnet 0.003634218 0.03126535
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.006390140 0.04269829
# Discard the temporary ret_lst object, then register the "fit.models_1_end"
# step (minor label "teardown", major step incremented) in this section's
# chunk-tracking data frame.
# NOTE(review): myadd_chunk is defined elsewhere; presumably it also closes the
# timing of the previous step -- confirm against its definition.
rm(ret_lst)
fit.models_1_chunk_df <- myadd_chunk(fit.models_1_chunk_df, "fit.models_1_end",
major.inc=TRUE, label.minor="teardown")
## label step_major step_minor
## 15 fit.models_1_Max.cor.Y.rcv.1X1.Interact 4 1
## 16 fit.models_1_end 5 0
## label_minor bgn end elapsed
## 15 glmnet 527.715 570.359 42.644
## 16 teardown 570.360 NA NA
# Register another "fit.models" step in the global chunk tracker without
# incrementing the major step number (major.inc=FALSE).
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc=FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 11 fit.models 6 1 1 298.940 570.368 271.428
## 12 fit.models 6 2 2 570.369 NA NA
#stop("here"); glb_to_sav(); all.equal(glb_models_df, sav_models_df)
# if (!is.null(glbMdlMetricSummaryFn)) {
# stats_df <- glb_models_df[, "id", FALSE]
#
# stats_mdl_df <- data.frame()
# for (mdl_id in stats_df$id) {
# stats_mdl_df <- rbind(stats_mdl_df,
# mypredict_mdl(glb_models_lst[[mdl_id]], glb_fitobs_df, glb_rsp_var,
# glb_rsp_var_out, mdl_id, "fit",
# glbMdlMetricSummaryFn, glbMdlMetricSummary,
# glbMdlMetricMaximize, ret_type="stats"))
# }
# stats_df <- merge(stats_df, stats_mdl_df, all.x=TRUE)
#
# stats_mdl_df <- data.frame()
# for (mdl_id in stats_df$id) {
# stats_mdl_df <- rbind(stats_mdl_df,
# mypredict_mdl(glb_models_lst[[mdl_id]], glb_OOBobs_df, glb_rsp_var,
# glb_rsp_var_out, mdl_id, "OOB",
# glbMdlMetricSummaryFn, glbMdlMetricSummary,
# glbMdlMetricMaximize, ret_type="stats"))
# }
# stats_df <- merge(stats_df, stats_mdl_df, all.x=TRUE)
#
# print("Merging following data into glb_models_df:")
# print(stats_mrg_df <- stats_df[, c(1, grep(glbMdlMetricSummary, names(stats_df)))])
# print(tmp_models_df <- orderBy(~id, glb_models_df[, c("id",
# grep(glbMdlMetricSummary, names(stats_df), value=TRUE))]))
#
# tmp2_models_df <- glb_models_df[, c("id", setdiff(names(glb_models_df),
# grep(glbMdlMetricSummary, names(stats_df), value=TRUE)))]
# tmp3_models_df <- merge(tmp2_models_df, stats_mrg_df, all.x=TRUE, sort=FALSE)
# print(tmp3_models_df)
# print(names(tmp3_models_df))
# print(glb_models_df <- subset(tmp3_models_df, select=-id.1))
# }
# Build the plotting copy of the model-comparison table:
#   1. drop the spread / confidence-bound columns (names containing "SD",
#      "Upper" or "Lower");
#   2. replace every "min.*" column by its reciprocal stored under the
#      corresponding "inv.*" name (presumably so that larger values are better
#      for every plotted metric -- confirm against the plotting code).
# Columns are selected with logical masks instead of -grep(...): a negative
# index built from an empty grep() result is integer(0), which would silently
# drop every column. drop=FALSE keeps the result a data frame even if only one
# column survives.
plt_models_df <- glb_models_df[, !grepl("SD|Upper|Lower", names(glb_models_df)),
                               drop=FALSE]
for (var in grep("^min\\.", names(plt_models_df), value=TRUE)) {
    # Anchor and escape the prefix so only a leading literal "min." is rewritten.
    plt_models_df[, sub("^min\\.", "inv.", var)] <-
        #ifelse(all(is.na(tmp <- plt_models_df[, var])), NA, 1.0 / tmp)
        1.0 / plt_models_df[, var]
    # Remove exactly this column; the name is compared literally rather than
    # being reused as a regular expression (which could match other columns).
    plt_models_df <- plt_models_df[, names(plt_models_df) != var, drop=FALSE]
}
print(plt_models_df)
## id
## MFO.myMFO_classfr MFO.myMFO_classfr
## Random.myrandom_classfr Random.myrandom_classfr
## Max.cor.Y.rcv.1X1.glmnet Max.cor.Y.rcv.1X1.glmnet
## Max.cor.Y.rcv.3X1.glmnet Max.cor.Y.rcv.3X1.glmnet
## Max.cor.Y.rcv.3X3.glmnet Max.cor.Y.rcv.3X3.glmnet
## Max.cor.Y.rcv.3X5.glmnet Max.cor.Y.rcv.3X5.glmnet
## Max.cor.Y.rcv.5X1.glmnet Max.cor.Y.rcv.5X1.glmnet
## Max.cor.Y.rcv.5X3.glmnet Max.cor.Y.rcv.5X3.glmnet
## Max.cor.Y.rcv.5X5.glmnet Max.cor.Y.rcv.5X5.glmnet
## Max.cor.Y.rcv.1X1.cp.0.rpart Max.cor.Y.rcv.1X1.cp.0.rpart
## Max.cor.Y.rpart Max.cor.Y.rpart
## Interact.High.cor.Y.glmnet Interact.High.cor.Y.glmnet
## Low.cor.X.glmnet Low.cor.X.glmnet
## RFE.X.glm RFE.X.glm
## RFE.X.bayesglm RFE.X.bayesglm
## RFE.X.glmnet RFE.X.glmnet
## RFE.X.rpart RFE.X.rpart
## RFE.X.gbm RFE.X.gbm
## RFE.X.rf RFE.X.rf
## RFE.X.nnet RFE.X.nnet
## RFE.X.avNNet RFE.X.avNNet
## RFE.X.earth RFE.X.earth
## All.X.glmnet All.X.glmnet
## Max.cor.Y.rcv.1X1.Interact.glmnet Max.cor.Y.rcv.1X1.Interact.glmnet
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet Max.cor.Y.rcv.1X1.YeoJohnson.glmnet
## Max.cor.Y.rcv.1X1.center.scale.glmnet Max.cor.Y.rcv.1X1.center.scale.glmnet
## Max.cor.Y.rcv.1X1.range.glmnet Max.cor.Y.rcv.1X1.range.glmnet
## Max.cor.Y.rcv.1X1.pca.glmnet Max.cor.Y.rcv.1X1.pca.glmnet
## Max.cor.Y.rcv.1X1.ica.glmnet Max.cor.Y.rcv.1X1.ica.glmnet
## Max.cor.Y.rcv.1X1.spatialSign.glmnet Max.cor.Y.rcv.1X1.spatialSign.glmnet
## feats
## MFO.myMFO_classfr .rnorm
## Random.myrandom_classfr .rnorm
## Max.cor.Y.rcv.1X1.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.3X1.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.3X3.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.3X5.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.5X1.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.5X3.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.5X5.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.cp.0.rpart spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rpart spdiff.cut.fctr,sprice.root2
## Interact.High.cor.Y.glmnet spdiff.cut.fctr,sprice.root2,spdiff.cut.fctr:D.terms.post.stop.n.log,spdiff.cut.fctr:D.weight.post.stem.sum,spdiff.cut.fctr:D.chrs.n.log,spdiff.cut.fctr:D.terms.post.stem.n.log,spdiff.cut.fctr:startprice.dcm1.is9,spdiff.cut.fctr:sprice.root2
## Low.cor.X.glmnet spdiff.cut.fctr,sprice.d20nexp,storage.fctr,D.ratio.weight.sum.wrds.n,.rnorm,startprice.dgt2.is9,D.wrds.stop.n.log,D.weight.sum.stem.stop.Ratio,cellular.fctr,startprice.dgt1.is9,prdl.descr.my.fctr,D.terms.post.stop.n.log,D.chrs.pnct11.n.log,color.fctr,D.chrs.pnct13.n.log,startprice.dcm1.is9,condition.fctr,sprice.root2,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.glm spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.bayesglm spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.glmnet spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.rpart spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.gbm spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.rf spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.nnet spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.avNNet spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## RFE.X.earth spdiff.cut.fctr,prdl.descr.my.fctr,sprice.d20nexp,sprice.log10,sprice.root2,condition.fctr,D.terms.post.stop.n.log,D.terms.post.stem.n.log,D.chrs.n.log,D.wrds.unq.n.log,startprice.dcm1.is9,cellular.fctr,D.ratio.weight.sum.wrds.n,D.ratio.wrds.stop.n.wrds.n,D.chrs.uppr.n.log,D.wrds.n.log,D.weight.post.stem.sum,D.weight.sum,D.chrs.pnct11.n.log,D.weight.post.stop.sum,color.fctr,D.wrds.stop.n.log,storage.fctr,startprice.dgt2.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## All.X.glmnet spdiff.cut.fctr,sprice.d20nexp,storage.fctr,D.ratio.wrds.stop.n.wrds.n,D.ratio.weight.sum.wrds.n,.rnorm,startprice.dgt2.is9,D.wrds.stop.n.log,D.weight.sum.stem.stop.Ratio,cellular.fctr,D.weight.post.stop.sum,D.weight.post.stem.sum,D.weight.sum,startprice.dgt1.is9,D.wrds.n.log,D.chrs.uppr.n.log,D.chrs.n.log,D.terms.post.stem.n.log,D.wrds.unq.n.log,prdl.descr.my.fctr,D.terms.post.stop.n.log,D.chrs.pnct11.n.log,startprice.dcm2.is9,color.fctr,D.chrs.pnct13.n.log,startprice.dcm1.is9,condition.fctr,sprice.log10,sprice.root2,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## Max.cor.Y.rcv.1X1.Interact.glmnet sprice.root2,spdiff.cut.fctr
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.center.scale.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.range.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.pca.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.ica.glmnet spdiff.cut.fctr,sprice.root2
## Max.cor.Y.rcv.1X1.spatialSign.glmnet spdiff.cut.fctr,sprice.root2
## max.nTuningRuns max.AUCpROC.fit
## MFO.myMFO_classfr 0 0.5000000
## Random.myrandom_classfr 0 0.5282909
## Max.cor.Y.rcv.1X1.glmnet 0 0.5653324
## Max.cor.Y.rcv.3X1.glmnet 25 0.5233645
## Max.cor.Y.rcv.3X3.glmnet 25 0.5233645
## Max.cor.Y.rcv.3X5.glmnet 25 0.5233645
## Max.cor.Y.rcv.5X1.glmnet 25 0.5536722
## Max.cor.Y.rcv.5X3.glmnet 25 0.5536722
## Max.cor.Y.rcv.5X5.glmnet 25 0.5536722
## Max.cor.Y.rcv.1X1.cp.0.rpart 0 0.6667806
## Max.cor.Y.rpart 5 0.5000000
## Interact.High.cor.Y.glmnet 25 0.5233645
## Low.cor.X.glmnet 25 0.5373832
## RFE.X.glm 1 0.7474431
## RFE.X.bayesglm 1 0.7252579
## RFE.X.glmnet 25 0.5373832
## RFE.X.rpart 5 0.5000000
## RFE.X.gbm 25 0.5548514
## RFE.X.rf 5 0.5000000
## RFE.X.nnet 25 0.7867880
## RFE.X.avNNet 25 0.7522924
## RFE.X.earth 5 0.5733667
## All.X.glmnet 25 0.5373832
## Max.cor.Y.rcv.1X1.Interact.glmnet 25 0.5233645
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 25 0.5233645
## Max.cor.Y.rcv.1X1.center.scale.glmnet 25 0.5233645
## Max.cor.Y.rcv.1X1.range.glmnet 25 0.5233645
## Max.cor.Y.rcv.1X1.pca.glmnet 20 0.5723638
## Max.cor.Y.rcv.1X1.ica.glmnet 20 0.5000000
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 25 0.5233645
## max.Sens.fit max.Spec.fit
## MFO.myMFO_classfr 1.0000000 0.00000000
## Random.myrandom_classfr 0.8042453 0.25233645
## Max.cor.Y.rcv.1X1.glmnet 0.9811321 0.14953271
## Max.cor.Y.rcv.3X1.glmnet 1.0000000 0.04672897
## Max.cor.Y.rcv.3X3.glmnet 1.0000000 0.04672897
## Max.cor.Y.rcv.3X5.glmnet 1.0000000 0.04672897
## Max.cor.Y.rcv.5X1.glmnet 0.9858491 0.12149533
## Max.cor.Y.rcv.5X3.glmnet 0.9858491 0.12149533
## Max.cor.Y.rcv.5X5.glmnet 0.9858491 0.12149533
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.9410377 0.39252336
## Max.cor.Y.rpart 1.0000000 0.00000000
## Interact.High.cor.Y.glmnet 1.0000000 0.04672897
## Low.cor.X.glmnet 1.0000000 0.07476636
## RFE.X.glm 0.9528302 0.54205607
## RFE.X.bayesglm 0.9551887 0.49532710
## RFE.X.glmnet 1.0000000 0.07476636
## RFE.X.rpart 1.0000000 0.00000000
## RFE.X.gbm 0.9882075 0.12149533
## RFE.X.rf 1.0000000 0.00000000
## RFE.X.nnet 0.8726415 0.70093458
## RFE.X.avNNet 0.9905660 0.51401869
## RFE.X.earth 0.9504717 0.19626168
## All.X.glmnet 1.0000000 0.07476636
## Max.cor.Y.rcv.1X1.Interact.glmnet 1.0000000 0.04672897
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 1.0000000 0.04672897
## Max.cor.Y.rcv.1X1.center.scale.glmnet 1.0000000 0.04672897
## Max.cor.Y.rcv.1X1.range.glmnet 1.0000000 0.04672897
## Max.cor.Y.rcv.1X1.pca.glmnet 0.9858491 0.15887850
## Max.cor.Y.rcv.1X1.ica.glmnet 1.0000000 0.00000000
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 1.0000000 0.04672897
## max.AUCROCR.fit
## MFO.myMFO_classfr 0.5000000
## Random.myrandom_classfr 0.5050586
## Max.cor.Y.rcv.1X1.glmnet 0.7580453
## Max.cor.Y.rcv.3X1.glmnet 0.7289279
## Max.cor.Y.rcv.3X3.glmnet 0.7289279
## Max.cor.Y.rcv.3X5.glmnet 0.7289279
## Max.cor.Y.rcv.5X1.glmnet 0.7585302
## Max.cor.Y.rcv.5X3.glmnet 0.7585302
## Max.cor.Y.rcv.5X5.glmnet 0.7585302
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.7336449
## Max.cor.Y.rpart 0.5000000
## Interact.High.cor.Y.glmnet 0.7465945
## Low.cor.X.glmnet 0.7637762
## RFE.X.glm 0.9143229
## RFE.X.bayesglm 0.8990698
## RFE.X.glmnet 0.7644375
## RFE.X.rpart 0.5000000
## RFE.X.gbm 0.7596544
## RFE.X.rf 0.8434800
## RFE.X.nnet 0.7898298
## RFE.X.avNNet 0.9486863
## RFE.X.earth 0.5733667
## All.X.glmnet 0.7644375
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.7289279
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.7288177
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.7289279
## Max.cor.Y.rcv.1X1.range.glmnet 0.7289279
## Max.cor.Y.rcv.1X1.pca.glmnet 0.7314407
## Max.cor.Y.rcv.1X1.ica.glmnet 0.6593414
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.7368850
## opt.prob.threshold.fit
## MFO.myMFO_classfr 0.2
## Random.myrandom_classfr 0.2
## Max.cor.Y.rcv.1X1.glmnet 0.2
## Max.cor.Y.rcv.3X1.glmnet 0.2
## Max.cor.Y.rcv.3X3.glmnet 0.2
## Max.cor.Y.rcv.3X5.glmnet 0.2
## Max.cor.Y.rcv.5X1.glmnet 0.2
## Max.cor.Y.rcv.5X3.glmnet 0.2
## Max.cor.Y.rcv.5X5.glmnet 0.2
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.3
## Max.cor.Y.rpart 0.2
## Interact.High.cor.Y.glmnet 0.2
## Low.cor.X.glmnet 0.2
## RFE.X.glm 0.3
## RFE.X.bayesglm 0.3
## RFE.X.glmnet 0.2
## RFE.X.rpart 0.2
## RFE.X.gbm 0.2
## RFE.X.rf 0.1
## RFE.X.nnet 0.4
## RFE.X.avNNet 0.3
## RFE.X.earth 0.1
## All.X.glmnet 0.2
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.2
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.2
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.2
## Max.cor.Y.rcv.1X1.range.glmnet 0.2
## Max.cor.Y.rcv.1X1.pca.glmnet 0.2
## Max.cor.Y.rcv.1X1.ica.glmnet 0.2
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.2
## max.f.score.fit max.Accuracy.fit
## MFO.myMFO_classfr 0.3354232 0.2015066
## Random.myrandom_classfr 0.3354232 0.2015066
## Max.cor.Y.rcv.1X1.glmnet 0.4778157 0.7118644
## Max.cor.Y.rcv.3X1.glmnet 0.4641638 0.8098207
## Max.cor.Y.rcv.3X3.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.3X5.glmnet 0.4641638 0.8098088
## Max.cor.Y.rcv.5X1.glmnet 0.4778157 0.8154583
## Max.cor.Y.rcv.5X3.glmnet 0.4778157 0.8123155
## Max.cor.Y.rcv.5X5.glmnet 0.4778157 0.8101741
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.5073171 0.8097928
## Max.cor.Y.rpart 0.3354232 0.7897092
## Interact.High.cor.Y.glmnet 0.4810127 0.8098079
## Low.cor.X.glmnet 0.4671053 0.8048071
## RFE.X.glm 0.6693548 0.7407370
## RFE.X.bayesglm 0.6778243 0.7815306
## RFE.X.glmnet 0.4655738 0.8054348
## RFE.X.rpart 0.3354232 0.7941175
## RFE.X.gbm 0.4951768 0.7978522
## RFE.X.rf 0.3787879 0.7984934
## RFE.X.nnet 0.6386555 0.8022536
## RFE.X.avNNet 0.7961165 0.8003774
## RFE.X.earth 0.3354232 0.7941069
## All.X.glmnet 0.4655738 0.8041793
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.1X1.range.glmnet 0.4641638 0.8110598
## Max.cor.Y.rcv.1X1.pca.glmnet 0.4533333 0.8114577
## Max.cor.Y.rcv.1X1.ica.glmnet 0.4375000 0.8010051
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.4591195 0.8079246
## max.Kappa.fit max.AUCpROC.OOB
## MFO.myMFO_classfr 0.00000000 0.5000000
## Random.myrandom_classfr 0.00000000 0.5168449
## Max.cor.Y.rcv.1X1.glmnet 0.29829244 0.5529412
## Max.cor.Y.rcv.3X1.glmnet 0.08662750 0.5187166
## Max.cor.Y.rcv.3X3.glmnet 0.09445917 0.5187166
## Max.cor.Y.rcv.3X5.glmnet 0.08571096 0.5187166
## Max.cor.Y.rcv.5X1.glmnet 0.16272716 0.5497326
## Max.cor.Y.rcv.5X3.glmnet 0.14648283 0.5497326
## Max.cor.Y.rcv.5X5.glmnet 0.14504375 0.5497326
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.38974546 0.5914439
## Max.cor.Y.rpart 0.06035055 0.5000000
## Interact.High.cor.Y.glmnet 0.08548063 0.5187166
## Low.cor.X.glmnet 0.08253030 0.5406417
## RFE.X.glm 0.20258982 0.6304813
## RFE.X.bayesglm 0.21090324 0.6000000
## RFE.X.glmnet 0.08387615 0.5406417
## RFE.X.rpart 0.07284994 0.5000000
## RFE.X.gbm 0.07317897 0.5368984
## RFE.X.rf 0.00000000 0.5000000
## RFE.X.nnet 0.04244521 0.5810160
## RFE.X.avNNet 0.04399954 0.5810160
## RFE.X.earth 0.07123983 0.5719251
## All.X.glmnet 0.08122606 0.5406417
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.09445917 0.5187166
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.09445917 0.5187166
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.09445917 0.5187166
## Max.cor.Y.rcv.1X1.range.glmnet 0.09445917 0.5187166
## Max.cor.Y.rcv.1X1.pca.glmnet 0.18549208 0.5529412
## Max.cor.Y.rcv.1X1.ica.glmnet 0.01920281 0.5000000
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.07213928 0.5187166
## max.Sens.OOB max.Spec.OOB
## MFO.myMFO_classfr 1.0000000 0.00000000
## Random.myrandom_classfr 0.8155080 0.21818182
## Max.cor.Y.rcv.1X1.glmnet 0.9786096 0.12727273
## Max.cor.Y.rcv.3X1.glmnet 0.9919786 0.04545455
## Max.cor.Y.rcv.3X3.glmnet 0.9919786 0.04545455
## Max.cor.Y.rcv.3X5.glmnet 0.9919786 0.04545455
## Max.cor.Y.rcv.5X1.glmnet 0.9812834 0.11818182
## Max.cor.Y.rcv.5X3.glmnet 0.9812834 0.11818182
## Max.cor.Y.rcv.5X5.glmnet 0.9812834 0.11818182
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.9010695 0.28181818
## Max.cor.Y.rpart 1.0000000 0.00000000
## Interact.High.cor.Y.glmnet 0.9919786 0.04545455
## Low.cor.X.glmnet 0.9812834 0.10000000
## RFE.X.glm 0.8609626 0.40000000
## RFE.X.bayesglm 0.9090909 0.29090909
## RFE.X.glmnet 0.9812834 0.10000000
## RFE.X.rpart 1.0000000 0.00000000
## RFE.X.gbm 0.9919786 0.08181818
## RFE.X.rf 1.0000000 0.00000000
## RFE.X.nnet 0.7620321 0.40000000
## RFE.X.avNNet 0.9438503 0.21818182
## RFE.X.earth 0.9438503 0.20000000
## All.X.glmnet 0.9812834 0.10000000
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.9919786 0.04545455
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.9919786 0.04545455
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.9919786 0.04545455
## Max.cor.Y.rcv.1X1.range.glmnet 0.9919786 0.04545455
## Max.cor.Y.rcv.1X1.pca.glmnet 0.9786096 0.12727273
## Max.cor.Y.rcv.1X1.ica.glmnet 1.0000000 0.00000000
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.9919786 0.04545455
## max.AUCROCR.OOB
## MFO.myMFO_classfr 0.5000000
## Random.myrandom_classfr 0.4962567
## Max.cor.Y.rcv.1X1.glmnet 0.7423189
## Max.cor.Y.rcv.3X1.glmnet 0.7148517
## Max.cor.Y.rcv.3X3.glmnet 0.7148517
## Max.cor.Y.rcv.3X5.glmnet 0.7148517
## Max.cor.Y.rcv.5X1.glmnet 0.7398882
## Max.cor.Y.rcv.5X3.glmnet 0.7398882
## Max.cor.Y.rcv.5X5.glmnet 0.7398882
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.6280870
## Max.cor.Y.rpart 0.5000000
## Interact.High.cor.Y.glmnet 0.7259844
## Low.cor.X.glmnet 0.7107560
## RFE.X.glm 0.7157997
## RFE.X.bayesglm 0.7037555
## RFE.X.glmnet 0.7120929
## RFE.X.rpart 0.5000000
## RFE.X.gbm 0.6834346
## RFE.X.rf 0.6501823
## RFE.X.nnet 0.5818060
## RFE.X.avNNet 0.6544361
## RFE.X.earth 0.5719251
## All.X.glmnet 0.7120929
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.7148517
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.7150705
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.7148517
## Max.cor.Y.rcv.1X1.range.glmnet 0.7148517
## Max.cor.Y.rcv.1X1.pca.glmnet 0.7029655
## Max.cor.Y.rcv.1X1.ica.glmnet 0.6876276
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.7264706
## opt.prob.threshold.OOB
## MFO.myMFO_classfr 0.2
## Random.myrandom_classfr 0.2
## Max.cor.Y.rcv.1X1.glmnet 0.2
## Max.cor.Y.rcv.3X1.glmnet 0.2
## Max.cor.Y.rcv.3X3.glmnet 0.2
## Max.cor.Y.rcv.3X5.glmnet 0.2
## Max.cor.Y.rcv.5X1.glmnet 0.2
## Max.cor.Y.rcv.5X3.glmnet 0.2
## Max.cor.Y.rcv.5X5.glmnet 0.2
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.2
## Max.cor.Y.rpart 0.2
## Interact.High.cor.Y.glmnet 0.2
## Low.cor.X.glmnet 0.2
## RFE.X.glm 0.3
## RFE.X.bayesglm 0.3
## RFE.X.glmnet 0.2
## RFE.X.rpart 0.2
## RFE.X.gbm 0.2
## RFE.X.rf 0.0
## RFE.X.nnet 0.0
## RFE.X.avNNet 0.3
## RFE.X.earth 0.1
## All.X.glmnet 0.2
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.2
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.2
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.2
## Max.cor.Y.rcv.1X1.range.glmnet 0.2
## Max.cor.Y.rcv.1X1.pca.glmnet 0.2
## Max.cor.Y.rcv.1X1.ica.glmnet 0.2
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.2
## max.f.score.OOB max.Accuracy.OOB
## MFO.myMFO_classfr 0.3703704 0.2272727
## Random.myrandom_classfr 0.3703704 0.2272727
## Max.cor.Y.rcv.1X1.glmnet 0.5175719 0.6880165
## Max.cor.Y.rcv.3X1.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.3X3.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.3X5.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.5X1.glmnet 0.5047923 0.6797521
## Max.cor.Y.rcv.5X3.glmnet 0.5047923 0.6797521
## Max.cor.Y.rcv.5X5.glmnet 0.5047923 0.6797521
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.4380165 0.7190083
## Max.cor.Y.rpart 0.3703704 0.2272727
## Interact.High.cor.Y.glmnet 0.4863222 0.6508264
## Low.cor.X.glmnet 0.4789644 0.6673554
## RFE.X.glm 0.4800000 0.7314050
## RFE.X.bayesglm 0.4682540 0.7231405
## RFE.X.glmnet 0.4774194 0.6652893
## RFE.X.rpart 0.3703704 0.2272727
## RFE.X.gbm 0.4312500 0.6239669
## RFE.X.rf 0.3703704 0.2272727
## RFE.X.nnet 0.3703704 0.2272727
## RFE.X.avNNet 0.4131455 0.7417355
## RFE.X.earth 0.3703704 0.2272727
## All.X.glmnet 0.4774194 0.6652893
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.1X1.range.glmnet 0.4868421 0.6776860
## Max.cor.Y.rcv.1X1.pca.glmnet 0.4620253 0.6487603
## Max.cor.Y.rcv.1X1.ica.glmnet 0.4878049 0.6528926
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.5015106 0.6590909
## max.Kappa.OOB
## MFO.myMFO_classfr 0.0000000
## Random.myrandom_classfr 0.0000000
## Max.cor.Y.rcv.1X1.glmnet 0.3158979
## Max.cor.Y.rcv.3X1.glmnet 0.2771693
## Max.cor.Y.rcv.3X3.glmnet 0.2771693
## Max.cor.Y.rcv.3X5.glmnet 0.2771693
## Max.cor.Y.rcv.5X1.glmnet 0.2977759
## Max.cor.Y.rcv.5X3.glmnet 0.2977759
## Max.cor.Y.rcv.5X5.glmnet 0.2977759
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.2527473
## Max.cor.Y.rpart 0.0000000
## Interact.High.cor.Y.glmnet 0.2634707
## Low.cor.X.glmnet 0.2633111
## RFE.X.glm 0.3024390
## RFE.X.bayesglm 0.2851600
## RFE.X.glmnet 0.2605809
## RFE.X.rpart 0.0000000
## RFE.X.gbm 0.1894737
## RFE.X.rf 0.0000000
## RFE.X.nnet 0.0000000
## RFE.X.avNNet 0.2478118
## RFE.X.earth 0.0000000
## All.X.glmnet 0.2605809
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.2771693
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.2771693
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.2771693
## Max.cor.Y.rcv.1X1.range.glmnet 0.2771693
## Max.cor.Y.rcv.1X1.pca.glmnet 0.2354865
## Max.cor.Y.rcv.1X1.ica.glmnet 0.2660842
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.2843060
## inv.elapsedtime.everything
## MFO.myMFO_classfr 3.61010830
## Random.myrandom_classfr 3.48432056
## Max.cor.Y.rcv.1X1.glmnet 1.38504155
## Max.cor.Y.rcv.3X1.glmnet 0.68073519
## Max.cor.Y.rcv.3X3.glmnet 0.47687172
## Max.cor.Y.rcv.3X5.glmnet 0.40209087
## Max.cor.Y.rcv.5X1.glmnet 0.59347181
## Max.cor.Y.rcv.5X3.glmnet 0.40502228
## Max.cor.Y.rcv.5X5.glmnet 0.32499188
## Max.cor.Y.rcv.1X1.cp.0.rpart 1.44300144
## Max.cor.Y.rpart 0.65530799
## Interact.High.cor.Y.glmnet 0.08480326
## Low.cor.X.glmnet 0.11087704
## RFE.X.glm 0.28968714
## RFE.X.bayesglm 0.16345211
## RFE.X.glmnet 0.04628987
## RFE.X.rpart 0.30599755
## RFE.X.gbm 0.12110936
## RFE.X.rf 0.14503263
## RFE.X.nnet 0.03820731
## RFE.X.avNNet 0.01383585
## RFE.X.earth 0.10088781
## All.X.glmnet 0.04684280
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.44843049
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.38759690
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.43327556
## Max.cor.Y.rcv.1X1.range.glmnet 0.43084877
## Max.cor.Y.rcv.1X1.pca.glmnet 0.47778309
## Max.cor.Y.rcv.1X1.ica.glmnet 0.44052863
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.40404040
## inv.elapsedtime.final
## MFO.myMFO_classfr 333.3333333
## Random.myrandom_classfr 500.0000000
## Max.cor.Y.rcv.1X1.glmnet 31.2500000
## Max.cor.Y.rcv.3X1.glmnet 38.4615385
## Max.cor.Y.rcv.3X3.glmnet 41.6666667
## Max.cor.Y.rcv.3X5.glmnet 38.4615385
## Max.cor.Y.rcv.5X1.glmnet 40.0000000
## Max.cor.Y.rcv.5X3.glmnet 41.6666667
## Max.cor.Y.rcv.5X5.glmnet 41.6666667
## Max.cor.Y.rcv.1X1.cp.0.rpart 66.6666667
## Max.cor.Y.rpart 76.9230769
## Interact.High.cor.Y.glmnet 0.7163324
## Low.cor.X.glmnet 1.0080645
## RFE.X.glm 4.2553191
## RFE.X.bayesglm 2.1367521
## RFE.X.glmnet 1.0288066
## RFE.X.rpart 14.9253731
## RFE.X.gbm 3.8461538
## RFE.X.rf 0.8163265
## RFE.X.nnet 8.6956522
## RFE.X.avNNet 0.9578544
## RFE.X.earth 0.6671114
## All.X.glmnet 0.9861933
## Max.cor.Y.rcv.1X1.Interact.glmnet 40.0000000
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 20.4081633
## Max.cor.Y.rcv.1X1.center.scale.glmnet 32.2580645
## Max.cor.Y.rcv.1X1.range.glmnet 34.4827586
## Max.cor.Y.rcv.1X1.pca.glmnet 50.0000000
## Max.cor.Y.rcv.1X1.ica.glmnet 47.6190476
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 24.3902439
# Draw a radar chart of plt_models_df (one row per model, one column per metric)
# via the project helper myplot_radar; print() is needed to render the returned plot.
print(myplot_radar(radar_inp_df=plt_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 30. Consider specifying shapes manually if you must have them.
## Warning: Removed 480 rows containing missing values (geom_point).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 30. Consider specifying shapes manually if you must have them.
# print(myplot_radar(radar_inp_df=subset(plt_models_df,
# !(mdl_id %in% grep("random|MFO", plt_models_df$id, value=TRUE)))))
# Derive 95% confidence bounds for every metric that reports a standard deviation.
# For a column named <prefix>SD<suffix> the point estimate is <prefix><suffix> and
# the bounds are written to <prefix>Upper<suffix> / <prefix>Lower<suffix>.
#
# max.df           : degrees of freedom = tuning runs - 1 (NA when there is at most one run)
# min.sd2ci.scaler : two-sided 95% t quantile for max.df (NA when max.df is NA)
glb_models_df <- mutate(
    glb_models_df,
    max.df = ifelse(max.nTuningRuns > 1, max.nTuningRuns - 1, NA),
    min.sd2ci.scaler = ifelse(is.na(max.df), NA, qt(p = 0.975, df = max.df))
)
for (var in grep("SD", names(glb_models_df), value = TRUE)) {
    # Split the column name around "SD" to derive the related column names
    var_components <- strsplit(var, "SD")[[1]]
    varActul <- paste0(var_components[1], var_components[2])
    varUpper <- paste0(var_components[1], "Upper", var_components[2])
    varLower <- paste0(var_components[1], "Lower", var_components[2])

    if (varUpper %in% names(glb_models_df)) {
        # The CI already exists (the Lower bound is assumed to exist too): keep it
        warning(varUpper, " already exists in glb_models_df")
    } else {
        print(sprintf("var:%s", var))
        # The CI half-width depends on the sample size through the t distribution (df = n - 1)
        ci_halfwidth <- glb_models_df[, "min.sd2ci.scaler"] * glb_models_df[, var]
        glb_models_df[, varUpper] <- glb_models_df[, varActul] + ci_halfwidth
        glb_models_df[, varLower] <- glb_models_df[, varActul] - ci_halfwidth
    }
}
## Warning: max.AccuracyUpper.fit already exists in glb_models_df
## [1] "var:max.KappaSD.fit"
# Prepare the inputs for the metric plot with confidence intervals.
#   plt_models_df   : id + the point estimate of every metric that has an "Upper" column
#   pltCI_models_df : id + the matching "Upper" and "Lower" bound columns
plt_models_df <- glb_models_df[, "id", drop = FALSE]
pltCI_models_df <- glb_models_df[, "id", drop = FALSE]
for (var in grep("Upper", names(glb_models_df), value = TRUE)) {
    # "<prefix>Upper<suffix>" -> c("<prefix>", "<suffix>")
    var_components <- strsplit(var, "Upper")[[1]]

    # Point estimate column: the same name with "Upper" removed
    col_name <- paste(var_components, collapse = "")
    plt_models_df[, col_name] <- glb_models_df[, col_name]

    # Copy both bounds of the interval
    ci_col_names <- paste0(var_components[1], c("Upper", "Lower"), var_components[2])
    for (name in ci_col_names) {
        pltCI_models_df[, name] <- glb_models_df[, name]
    }
}
# build_statsCI_data: melt a wide model-metrics frame to long form and tag every
# row with the metric and the data set encoded in its column name.
#
# Args:
#   plt_models_df: data frame with an "id" column plus one column per metric,
#                  named "<label>.<data>" (e.g. "max.Accuracy.fit").
#
# Returns:
#   melt(plt_models_df, id.vars = "id") with two additional character columns:
#     data  - the text after the last "." of `variable` (e.g. "fit", "OOB")
#     label - `variable` without that trailing ".<data>" (e.g. "max.Accuracy")
#   A `variable` without any "." is returned unchanged in both columns.
build_statsCI_data <- function(plt_models_df) {
    mltd_models_df <- melt(plt_models_df, id.vars = "id")
    variable_chr <- as.character(mltd_models_df$variable)

    # Vectorised, so an empty frame is handled (a 1:nrow() loop would visit the
    # non-existent rows 1 and 0).
    mltd_models_df$data <- sub("^.*[.]", "", variable_chr)

    # Strip only the trailing ".<data>". Splitting on paste0(".", data) treated
    # "." as a regex wildcard and cut at the FIRST match, which truncated labels
    # that contain the data-set name (e.g. "max.fitness.fit" became "max").
    mltd_models_df$label <- sub("[.][^.]*$", "", variable_chr)

    #print(mltd_models_df)
    return(mltd_models_df)
}
# Long-form point estimates (tagged with label / data by build_statsCI_data)
mltd_models_df <- build_statsCI_data(plt_models_df)

# Long-form CI bounds: every "<label><Upper|Lower>.<data>" variable is decomposed into
#   label - text before "Upper"/"Lower"
#   data  - second "."-separated piece of the text after it
#   type  - "Upper" or "Lower"
mltdCI_models_df <- melt(pltCI_models_df, id.vars = "id")
for (row_ix in 1:nrow(mltdCI_models_df)) {
    ci_variable <- as.character(mltdCI_models_df[row_ix, "variable"])
    for (type in c("Upper", "Lower")) {
        var_components <- unlist(strsplit(ci_variable, type))
        # The bound type is present only when the split yields more than one piece
        if (length(var_components) <= 1)
            next
        #print(sprintf("row_ix:%d; type:%s; ", row_ix, type))
        mltdCI_models_df[row_ix, "label"] <- var_components[1]
        mltdCI_models_df[row_ix, "data"] <-
            strsplit(var_components[2], "[.]")[[1]][2]
        mltdCI_models_df[row_ix, "type"] <- type
        break
    }
}
# Spread the long CI frame so that each (id, label, data) row carries both bounds
# side by side (columns value.Upper and value.Lower, used by the error-bar layer).
ci_id_cols <- setdiff(names(mltdCI_models_df), c("type", "value", "variable"))
wideCI_models_df <- reshape(
    subset(mltdCI_models_df, select = -variable),
    idvar = ci_id_cols,
    timevar = "type",
    direction = "wide"
)
#print(wideCI_models_df)
# Attach the point estimates; merge() joins on the columns the two frames share,
# and all.x=TRUE keeps every CI row even without a matching estimate.
mrgdCI_models_df <- merge(x = wideCI_models_df, y = mltd_models_df, all.x = TRUE)
#print(mrgdCI_models_df)
# Merge stats back in if CIs don't exist.
# mltd_models_df only holds the "<label>.<data>" metrics that have a CI. For every
# label x data combination that is missing from it, the plain statistic is pulled
# from glb_models_df so it can still be plotted.

# Combinations already present; computed once instead of on every loop iteration.
known_var_types <- unique(paste(mltd_models_df$label, mltd_models_df$data, sep = "."))
data_types <- unique(mltd_models_df$data)

goback_vars <- c()
for (var in unique(mltd_models_df$label)) {
    var_types <- paste0(var, ".", data_types)
    goback_vars <- c(goback_vars, var_types[!(var_types %in% known_var_types)])
}

# Only statistics that were actually computed can be merged back. Selecting a
# column that does not exist would abort with "undefined columns selected".
goback_vars <- intersect(goback_vars, names(glb_models_df))

if (length(goback_vars) > 0) {
    mltd_goback_df <- build_statsCI_data(glb_models_df[, c("id", goback_vars)])
    mltd_models_df <- rbind(mltd_models_df, mltd_goback_df)
}
# mltd_models_df <- merge(mltd_models_df, glb_models_df[, c("id", "model_method")],
# all.x=TRUE)
# Render the per-model metric bars with CI whiskers into <glb_out_pfx>models_bar.png;
# the device opened here is closed by the dev.off() call that follows this statement.
# The plot is also kept in `gp` so it can be printed again afterwards.
png(paste0(glb_out_pfx, "models_bar.png"), width=480*3, height=480*2)
#print(gp <- myplot_bar(mltd_models_df, "id", "value", colorcol_name="model_method") +
# The error-bar layer uses its own data (mrgdCI_models_df), whose model key column
# is "id". It has no "mdl_id" column, so aes(x=mdl_id) would be resolved against
# whatever `mdl_id` object happens to exist in the calling environment.
print(gp <- myplot_bar(df=mltd_models_df, xcol_name="id", ycol_names="value") +
    geom_errorbar(data=mrgdCI_models_df,
        mapping=aes(x=id, ymax=value.Upper, ymin=value.Lower), width=0.5) +
    facet_grid(label ~ data, scales="free") +
    theme(axis.text.x = element_text(angle = 90,vjust = 0.5)))
## Warning: Removed 7 rows containing missing values (geom_errorbar).
# Close the png() device opened for models_bar.png so the file is flushed to disk.
dev.off()
## quartz_off_screen
## 2
# Draw the same plot again on the current (default) device, now that the png device is closed.
print(gp)
## Warning: Removed 7 rows containing missing values (geom_errorbar).
# Columns shown in the model ranking table:
#   "id", the evaluation metrics from glbMdlMetricsEval that exist in glb_models_df,
#   and every column whose name contains the literal text "opt." (fixed = TRUE).
model_col_names <- names(glb_models_df)
eval_metric_cols <- glbMdlMetricsEval[glbMdlMetricsEval %in% model_col_names]
opt_cols <- grep("opt.", model_col_names, fixed = TRUE, value = TRUE)
dsp_models_cols <- c("id", eval_metric_cols, opt_cols)
# if (glb_is_classification && glb_is_binomial)
# dsp_models_cols <- c(dsp_models_cols, "opt.prob.threshold.OOB")

# Rank the models with the model-selection formula and keep the display columns
dsp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)[, dsp_models_cols]
print(dsp_models_df)
## id max.AUCROCR.OOB max.AUCpROC.OOB
## 3 Max.cor.Y.rcv.1X1.glmnet 0.7423189 0.5529412
## 7 Max.cor.Y.rcv.5X1.glmnet 0.7398882 0.5497326
## 8 Max.cor.Y.rcv.5X3.glmnet 0.7398882 0.5497326
## 9 Max.cor.Y.rcv.5X5.glmnet 0.7398882 0.5497326
## 30 Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.7264706 0.5187166
## 12 Interact.High.cor.Y.glmnet 0.7259844 0.5187166
## 14 RFE.X.glm 0.7157997 0.6304813
## 25 Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.7150705 0.5187166
## 5 Max.cor.Y.rcv.3X3.glmnet 0.7148517 0.5187166
## 24 Max.cor.Y.rcv.1X1.Interact.glmnet 0.7148517 0.5187166
## 26 Max.cor.Y.rcv.1X1.center.scale.glmnet 0.7148517 0.5187166
## 27 Max.cor.Y.rcv.1X1.range.glmnet 0.7148517 0.5187166
## 4 Max.cor.Y.rcv.3X1.glmnet 0.7148517 0.5187166
## 6 Max.cor.Y.rcv.3X5.glmnet 0.7148517 0.5187166
## 16 RFE.X.glmnet 0.7120929 0.5406417
## 23 All.X.glmnet 0.7120929 0.5406417
## 13 Low.cor.X.glmnet 0.7107560 0.5406417
## 15 RFE.X.bayesglm 0.7037555 0.6000000
## 28 Max.cor.Y.rcv.1X1.pca.glmnet 0.7029655 0.5529412
## 29 Max.cor.Y.rcv.1X1.ica.glmnet 0.6876276 0.5000000
## 18 RFE.X.gbm 0.6834346 0.5368984
## 21 RFE.X.avNNet 0.6544361 0.5810160
## 19 RFE.X.rf 0.6501823 0.5000000
## 10 Max.cor.Y.rcv.1X1.cp.0.rpart 0.6280870 0.5914439
## 20 RFE.X.nnet 0.5818060 0.5810160
## 22 RFE.X.earth 0.5719251 0.5719251
## 17 RFE.X.rpart 0.5000000 0.5000000
## 11 Max.cor.Y.rpart 0.5000000 0.5000000
## 1 MFO.myMFO_classfr 0.5000000 0.5000000
## 2 Random.myrandom_classfr 0.4962567 0.5168449
## max.Accuracy.OOB max.Accuracy.fit opt.prob.threshold.fit
## 3 0.6880165 0.7118644 0.2
## 7 0.6797521 0.8154583 0.2
## 8 0.6797521 0.8123155 0.2
## 9 0.6797521 0.8101741 0.2
## 30 0.6590909 0.8079246 0.2
## 12 0.6508264 0.8098079 0.2
## 14 0.7314050 0.7407370 0.3
## 25 0.6776860 0.8110598 0.2
## 5 0.6776860 0.8110598 0.2
## 24 0.6776860 0.8110598 0.2
## 26 0.6776860 0.8110598 0.2
## 27 0.6776860 0.8110598 0.2
## 4 0.6776860 0.8098207 0.2
## 6 0.6776860 0.8098088 0.2
## 16 0.6652893 0.8054348 0.2
## 23 0.6652893 0.8041793 0.2
## 13 0.6673554 0.8048071 0.2
## 15 0.7231405 0.7815306 0.3
## 28 0.6487603 0.8114577 0.2
## 29 0.6528926 0.8010051 0.2
## 18 0.6239669 0.7978522 0.2
## 21 0.7417355 0.8003774 0.3
## 19 0.2272727 0.7984934 0.1
## 10 0.7190083 0.8097928 0.3
## 20 0.2272727 0.8022536 0.4
## 22 0.2272727 0.7941069 0.1
## 17 0.2272727 0.7941175 0.2
## 11 0.2272727 0.7897092 0.2
## 1 0.2272727 0.2015066 0.2
## 2 0.2272727 0.2015066 0.2
## opt.prob.threshold.OOB
## 3 0.2
## 7 0.2
## 8 0.2
## 9 0.2
## 30 0.2
## 12 0.2
## 14 0.3
## 25 0.2
## 5 0.2
## 24 0.2
## 26 0.2
## 27 0.2
## 4 0.2
## 6 0.2
## 16 0.2
## 23 0.2
## 13 0.2
## 15 0.3
## 28 0.2
## 29 0.2
## 18 0.2
## 21 0.3
## 19 0.0
## 10 0.2
## 20 0.0
## 22 0.1
## 17 0.2
## 11 0.2
## 1 0.2
## 2 0.2
# Radar chart restricted to the display columns of the ranked models (dsp_models_df).
print(myplot_radar(radar_inp_df = dsp_models_df))
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 30. Consider specifying shapes manually if you must have them.
## Warning: Removed 168 rows containing missing values (geom_point).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: The shape palette can deal with a maximum of 6 discrete values
## because more than 6 becomes difficult to discriminate; you have
## 30. Consider specifying shapes manually if you must have them.
# Show the formula returned by get_model_sel_frmla(), i.e. the ordering used to rank the models
print("Metrics used for model selection:")
print(get_model_sel_frmla())
## [1] "Metrics used for model selection:"
## ~-max.AUCROCR.OOB - max.AUCpROC.OOB - max.Accuracy.OOB - max.Accuracy.fit -
## opt.prob.threshold.OOB
## <environment: 0x7fa3270ed698>
# dsp_models_df is ordered by the model-selection formula, so its first row is the top-ranked model.
print(sprintf("Best model id: %s", dsp_models_df[1, "id"]))
## [1] "Best model id: Max.cor.Y.rcv.1X1.glmnet"
# Scores `df` with the model stored under `mdl_id` in glb_models_lst and returns
#   `df` augmented with prediction / error columns.
#
# Args:
#   df:                 data frame of observations; must contain glb_rsp_var.
#   mdl_id:             name of the model in glb_models_lst (and `id` in glb_models_df).
#   rsp_var_out:        IGNORED -- the column prefix is always rebuilt below as
#                       paste0(glb_rsp_var, ".predict."); kept for caller compatibility.
#   prob_threshold_def: fallback probability cut-off (binomial only) used when
#                       glb_models_df has no usable opt.prob.threshold.OOB for mdl_id.
#   verbose:            if TRUE, print diagnostic plots / the largest-error rows.
#
# Columns added (prefix = paste0(glb_rsp_var, ".predict.", mdl_id)):
#   <prefix>           predicted value / class
#   <prefix>.prob      predicted probability of the 2nd response level (binomial only)
#   <prefix>.err       regression: prediction - actual; binomial: logical mismatch flag
#   <prefix>.err.abs   regression: abs(.err); binomial: distance of .prob from the
#                      actual class (1 for the 2nd level, 0 for the 1st)
#   <prefix>.accurate  prediction == actual
#
# Stops for multinomial classification (error calculation not implemented) and
#   when no probability threshold can be determined for a binomial model.
glb_get_predictions <- function(df, mdl_id, rsp_var_out, prob_threshold_def=NULL, verbose=FALSE) {
    mdl <- glb_models_lst[[mdl_id]]
    #rsp_var_out <- paste0(rsp_var_out, mdl_id)
    # NOTE: overrides the rsp_var_out argument
    rsp_var_out <- paste0(glb_rsp_var, ".predict.")
    predct_var_name <- paste0(rsp_var_out, mdl_id)
    predct_prob_var_name <- paste0(rsp_var_out, mdl_id, ".prob")
    predct_accurate_var_name <- paste0(rsp_var_out, mdl_id, ".accurate")
    predct_error_var_name <- paste0(rsp_var_out, mdl_id, ".err")
    predct_erabs_var_name <- paste0(rsp_var_out, mdl_id, ".err.abs")

    if (glb_is_regression) {
        df[, predct_var_name] <- predict(mdl, newdata=df, type="raw")
        if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
                  facet_wrap(reformulate(glb_category_var), scales = "free") +
                  stat_smooth(method="glm"))

        df[, predct_error_var_name] <- df[, predct_var_name] - df[, glb_rsp_var]
        if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
                  #facet_wrap(reformulate(glb_category_var), scales = "free") +
                  stat_smooth(method="auto"))
        if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
                  #facet_wrap(reformulate(glb_category_var), scales = "free") +
                  stat_smooth(method="glm"))

        df[, predct_erabs_var_name] <- abs(df[, predct_error_var_name])
        if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))

        df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
    }

    if (glb_is_classification && glb_is_binomial) {
        # `[1]` collapses the lookup to a scalar: an unknown mdl_id (zero rows)
        #   becomes NA and duplicate ids use the first match, so the scalar
        #   test below is always well-defined.
        prob_threshold <- glb_models_df[glb_models_df$id == mdl_id,
                                        "opt.prob.threshold.OOB"][1]
        if (is.null(prob_threshold) || is.na(prob_threshold)) {
            warning("Using default probability threshold: ", prob_threshold_def)
            if (is.null(prob_threshold <- prob_threshold_def))
                stop("Default probability threshold is NULL")
        }

        # Column 2 of the probability matrix is used as the probability of the
        #   2nd level of the response factor.
        df[, predct_prob_var_name] <- predict(mdl, newdata = df, type = "prob")[, 2]
        # (prob >= threshold) * 1 + 1 maps FALSE/TRUE to level index 1/2
        df[, predct_var_name] <-
            factor(levels(df[, glb_rsp_var])[
                (df[, predct_prob_var_name] >=
                    prob_threshold) * 1 + 1], levels(df[, glb_rsp_var]))

#         if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_var_name) +
#                   facet_wrap(reformulate(glb_category_var), scales = "free") +
#                   stat_smooth(method="glm"))

        df[, predct_error_var_name] <- df[, predct_var_name] != df[, glb_rsp_var]
#         if (verbose) print(myplot_scatter(df, predct_var_name, predct_error_var_name) +
#                   #facet_wrap(reformulate(glb_category_var), scales = "free") +
#                   stat_smooth(method="auto"))
#         if (verbose) print(myplot_scatter(df, glb_rsp_var, predct_error_var_name) +
#                   #facet_wrap(reformulate(glb_category_var), scales = "free") +
#                   stat_smooth(method="glm"))

        # if prediction is a TP (true +ve), measure distance from 1.0
        tp <- which((df[, predct_var_name] == df[, glb_rsp_var]) &
                    (df[, predct_var_name] == levels(df[, glb_rsp_var])[2]))
        df[tp, predct_erabs_var_name] <- abs(1 - df[tp, predct_prob_var_name])
        #rowIx <- which.max(df[tp, predct_erabs_var_name]); df[tp, c(glb_id_var, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]

        # if prediction is a TN (true -ve), measure distance from 0.0
        tn <- which((df[, predct_var_name] == df[, glb_rsp_var]) &
                    (df[, predct_var_name] == levels(df[, glb_rsp_var])[1]))
        df[tn, predct_erabs_var_name] <- abs(0 - df[tn, predct_prob_var_name])
        #rowIx <- which.max(df[tn, predct_erabs_var_name]); df[tn, c(glb_id_var, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]

        # if prediction is a FP (false +ve), measure distance from 0.0
        fp <- which((df[, predct_var_name] != df[, glb_rsp_var]) &
                    (df[, predct_var_name] == levels(df[, glb_rsp_var])[2]))
        df[fp, predct_erabs_var_name] <- abs(0 - df[fp, predct_prob_var_name])
        #rowIx <- which.max(df[fp, predct_erabs_var_name]); df[fp, c(glb_id_var, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]

        # if prediction is a FN (false -ve), measure distance from 1.0
        fn <- which((df[, predct_var_name] != df[, glb_rsp_var]) &
                    (df[, predct_var_name] == levels(df[, glb_rsp_var])[1]))
        df[fn, predct_erabs_var_name] <- abs(1 - df[fn, predct_prob_var_name])
        #rowIx <- which.max(df[fn, predct_erabs_var_name]); df[fn, c(glb_id_var, glb_rsp_var, predct_var_name, predct_prob_var_name, predct_erabs_var_name)][rowIx, ]

        if (verbose) print(head(orderBy(reformulate(c("-", predct_erabs_var_name)), df)))

        df[, predct_accurate_var_name] <- (df[, glb_rsp_var] == df[, predct_var_name])
    }

    if (glb_is_classification && !glb_is_binomial) {
        df[, predct_var_name] <- predict(mdl, newdata = df, type = "raw")
        df[, paste0(predct_var_name, ".prob")] <-
            predict(mdl, newdata = df, type = "prob")
        stop("Multinomial prediction error calculation needs to be implemented...")
    }

    return(df)
}
#stop("here"); glb_to_sav(); glb_allobs_df <- sav_allobs_df; glb_trnobs_df <- sav_trnobs_df; glb_fitobs_df <- sav_fitobs_df; glb_OOBobs_df <- sav_OOBobs_df; sav_models_df <- glb_models_df; glb_models_df <- sav_models_df; glb_featsimp_df <- sav_featsimp_df
# Summarises the absolute prediction error of model `mdl_id` per category.
#
# Args:
#   obs_df: observations; predictions for mdl_id are added via
#           glb_get_predictions() when the prediction column is missing.
#   mdl_id: model id whose prediction columns are summarised.
#   label:  suffix used in the output column names (e.g. "fit", "OOB").
#
# Returns an ungrouped data frame with one row per glb_category_var value and
#   the columns err.abs.<label>.sum, err.abs.<label>.mean and .n.<label>.
myget_category_stats <- function(obs_df, mdl_id, label) {
    require(dplyr)
    require(lazyeval)

    pred_col <- paste0(glb_rsp_var_out, mdl_id)
    pred_erabs_col <- paste0(glb_rsp_var_out, mdl_id, ".err.abs")
    label_erabs_col <- paste0("err.abs.", label)

    # Score the observations on demand
    if (!(pred_col %in% names(obs_df)))
        obs_df <- glb_get_predictions(obs_df, mdl_id, glb_rsp_var_out)

    slim_df <- dplyr::select_(obs_df,
                              glb_category_var, glb_rsp_var, pred_col, pred_erabs_col)
    # The abs-error column was selected last; give it the label-based name
    names(slim_df)[length(names(slim_df))] <- label_erabs_col

    by_ctgry_df <- dplyr::group_by_(slim_df, glb_category_var)
    stats_df <- dplyr::summarise_(by_ctgry_df,
                                  interp(~sum(var), var = as.name(label_erabs_col)),
                                  interp(~mean(var), var = as.name(label_erabs_col)),
                                  interp(~n()))
    names(stats_df) <- c(glb_category_var,
                         paste0("err.abs.", label, ".sum"),
                         paste0("err.abs.", label, ".mean"),
                         paste0(".n.", label))

    return(dplyr::ungroup(stats_df))
}
#print(colSums((ctgry_df <- myget_category_stats(obs_df=glb_fitobs_df, mdl_id="", label="fit"))[, -grep(glb_category_var, names(ctgry_df))]))
# Build a stacked ("Ensemble") model: score the fit & OOB observations with each
#   member model, then fit a glmnet on those predictions (class probabilities for
#   classification). glb_mdl_ensemble is either the keyword "auto" (use every
#   model ranked ahead of the first MFO/Random/Baseline model) or a vector of
#   model ids.
if (!is.null(glb_mdl_ensemble)) {
    mdl_id_pfx <- "Ensemble"

    # Scalar flags: && short-circuits instead of doing element-wise &
    if (#(glb_is_regression) ||
        ((glb_is_classification) && (!glb_is_binomial)))
        stop("Ensemble models not implemented yet for multinomial classification")

    # glb_mdl_ensemble may be a vector of ids; comparing it directly inside if()
    #   only looked at the first element (with a warning) and is an error in
    #   R >= 4.2, so test explicitly for the single keyword.
    if ((length(glb_mdl_ensemble) == 1) && isTRUE(glb_mdl_ensemble == "auto")) {
        mdl_id_pfx <- paste0(mdl_id_pfx, ".auto")
        tmp_models_df <- orderBy(get_model_sel_frmla(), glb_models_df)
        row.names(tmp_models_df) <- tmp_models_df$id
#         mdl_threshold_pos <- min(which(tmp_models_df$id %in%
#                                 c("MFO.myMFO_classfr", "Baseline.mybaseln_classfr"))) - 1
        # Without any baseline-type model min() would return Inf; use all models
        baseline_pos <- grep("MFO|Random|Baseline", tmp_models_df$id)
        mdl_threshold_pos <- if (length(baseline_pos) > 0)
            min(baseline_pos) - 1 else
            nrow(tmp_models_df)
        # NOTE(review): when a baseline model ranks first (position 0), 1:0
        #   still selects that first model -- confirm this is the intent.
        glb_mdl_ensemble <- tmp_models_df$id[1:mdl_threshold_pos]
    }

    for (mdl_id in glb_mdl_ensemble) {
        if (!(mdl_id %in% names(glb_models_lst))) {
            warning("Model ", mdl_id, " in glb_model_ensemble not found !")
            next
        }
        glb_fitobs_df <- glb_get_predictions(df = glb_fitobs_df, mdl_id,
                                             glb_rsp_var_out)
        glb_OOBobs_df <- glb_get_predictions(df = glb_OOBobs_df, mdl_id,
                                             glb_rsp_var_out)
    }

    #mdl_id_pfx <- "Ensemble.RFE"; mdlId <- paste0(mdl_id_pfx, ".glmnet")
    #glb_mdl_ensemble <- gsub(glb_rsp_var_out, "", grep("RFE\\.X\\.(?!Interact)", row.names(glb_featsimp_df), perl = TRUE, value = TRUE), fixed = TRUE)
    #varImp(glb_models_lst[[mdlId]])

    #cor_df <- data.frame(cor=cor(glb_fitobs_df[, glb_rsp_var], glb_fitobs_df[, paste(glb_rsp_var_out, glb_mdl_ensemble)], use="pairwise.complete.obs"))
    #glb_fitobs_df <- glb_get_predictions(df=glb_fitobs_df, "Ensemble.glmnet", glb_rsp_var_out);print(colSums((ctgry_df <- myget_category_stats(obs_df=glb_fitobs_df, mdl_id="Ensemble.glmnet", label="fit"))[, -grep(glb_category_var, names(ctgry_df))]))

    ### bid0_sp
    #  Better than MFO; models.n=28; min.RMSE.fit=0.0521233; err.abs.fit.sum=7.3631895
    #  old: Top x from auto; models.n= 5; min.RMSE.fit=0.06311047; err.abs.fit.sum=9.5937080
    #  RFE only ;       models.n=16; min.RMSE.fit=0.05148588; err.abs.fit.sum=7.2875091
    #  RFE subset only ;models.n= 5; min.RMSE.fit=0.06040702; err.abs.fit.sum=9.059088
    #  RFE subset only ;models.n= 9; min.RMSE.fit=0.05933167; err.abs.fit.sum=8.7421288
    #  RFE subset only ;models.n=15; min.RMSE.fit=0.0584607; err.abs.fit.sum=8.5902066
    #  RFE subset only ;models.n=17; min.RMSE.fit=0.05496899; err.abs.fit.sum=8.0170431
    #  RFE subset only ;models.n=18; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
    #  RFE subset only ;models.n=16; min.RMSE.fit=0.05441577; err.abs.fit.sum=7.837223
    ### bid0_sp
    ### bid1_sp
    # "auto"; err.abs.fit.sum=76.699774; min.RMSE.fit=0.2186429
    # "RFE.X.*"; err.abs.fit.sum=; min.RMSE.fit=0.221114
    ### bid1_sp

    # Predictors of the ensemble are the member models' prediction columns
    indep_vars <- paste(glb_rsp_var_out, glb_mdl_ensemble, sep = "")
    if (glb_is_classification)
        indep_vars <- paste(indep_vars, ".prob", sep = "")
    # Some models in glb_mdl_ensemble might not be fitted e.g. RFE.X.Interact
    indep_vars <- intersect(indep_vars, names(glb_fitobs_df))

#     indep_vars <- grep(glb_rsp_var_out, names(glb_fitobs_df), fixed=TRUE, value=TRUE)
#     if (glb_is_regression)
#         indep_vars <- indep_vars[!grepl("(err\\.abs|accurate)$", indep_vars)]
#     if (glb_is_classification && glb_is_binomial)
#         indep_vars <- grep("prob$", indep_vars, value=TRUE) else
#         indep_vars <- indep_vars[!grepl("err$", indep_vars)]

    #rfe_fit_ens_results <- myrun_rfe(glb_fitobs_df, indep_vars)

    for (method in c("glmnet")) {
        #sav_models_df <- glb_models_df; all.equal(sav_models_df, glb_models_df)
        #glb_models_df <- sav_models_df; print(glb_models_df$id)
        ret_lst <- myfit_mdl(
            mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
                id.prefix = mdl_id_pfx,
                type = glb_model_type, tune.df = NULL,
                trainControl.method = "repeatedcv",
                trainControl.number = glb_rcv_n_folds,
                trainControl.repeats = glb_rcv_n_repeats,
                trainControl.classProbs = glb_is_classification,
                trainControl.summaryFunction = glbMdlMetricSummaryFn,
                train.metric = glbMdlMetricSummary,
                train.maximize = glbMdlMetricMaximize,
                train.method = method)),
            indep_vars = indep_vars, rsp_var = glb_rsp_var,
            fit_df = glb_fitobs_df, OOB_df = glb_OOBobs_df)
    }
    # Refresh the displayed model comparison table after the ensemble fit
    dsp_models_df <- get_dsp_models_df()
}
## Warning in if (glb_mdl_ensemble == "auto") {: the condition has length > 1
## and only the first element will be used
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: Model RFE.X.Interact.glmnet in glb_model_ensemble not found !
## [1] "fitting model: Ensemble.glmnet"
## [1] " indep_vars: sold.fctr.predict.RFE.X.rpart.prob,sold.fctr.predict.RFE.X.earth.prob,sold.fctr.predict.RFE.X.glmnet.prob,sold.fctr.predict.RFE.X.glm.prob,sold.fctr.predict.RFE.X.bayesglm.prob,sold.fctr.predict.RFE.X.nnet.prob,sold.fctr.predict.RFE.X.avNNet.prob,sold.fctr.predict.RFE.X.gbm.prob"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.55, lambda = 0.000293 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdl_id_pfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 93 -none- numeric
## beta 744 dgCMatrix S4
## df 93 -none- numeric
## dim 2 -none- numeric
## lambda 93 -none- numeric
## dev.ratio 93 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.772081
## sold.fctr.predict.RFE.X.avNNet.prob
## 18.665537
## sold.fctr.predict.RFE.X.bayesglm.prob
## -5.982486
## sold.fctr.predict.RFE.X.earth.prob
## -2.498722
## sold.fctr.predict.RFE.X.gbm.prob
## -12.574421
## sold.fctr.predict.RFE.X.glm.prob
## 6.329718
## sold.fctr.predict.RFE.X.glmnet.prob
## 4.888828
## sold.fctr.predict.RFE.X.nnet.prob
## 1.865833
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.777437
## sold.fctr.predict.RFE.X.avNNet.prob
## 18.712700
## sold.fctr.predict.RFE.X.bayesglm.prob
## -6.073924
## sold.fctr.predict.RFE.X.earth.prob
## -2.512910
## sold.fctr.predict.RFE.X.gbm.prob
## -12.632938
## sold.fctr.predict.RFE.X.glm.prob
## 6.388467
## sold.fctr.predict.RFE.X.glmnet.prob
## 4.968946
## sold.fctr.predict.RFE.X.nnet.prob
## 1.873890
## sold.fctr sold.fctr.predict.Ensemble.glmnet.N
## 1 N 412
## 2 Y 14
## sold.fctr.predict.Ensemble.glmnet.Y
## 1 12
## 2 93
## Prediction
## Reference N Y
## N 412 12
## Y 14 93
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.510358e-01 8.467737e-01 9.290766e-01 9.677691e-01 7.984934e-01
## AccuracyPValue McnemarPValue
## 4.295099e-24 8.445193e-01
## sold.fctr sold.fctr.predict.Ensemble.glmnet.N
## 1 N 300
## 2 Y 56
## sold.fctr.predict.Ensemble.glmnet.Y
## 1 74
## 2 54
## Prediction
## Reference N Y
## N 300 74
## Y 56 54
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 0.7314050 0.2770475 0.6895485 0.7704077 0.7727273
## AccuracyPValue McnemarPValue
## 0.9856745 0.1359619
## id
## 1 Ensemble.glmnet
## feats
## 1 sold.fctr.predict.RFE.X.rpart.prob,sold.fctr.predict.RFE.X.earth.prob,sold.fctr.predict.RFE.X.glmnet.prob,sold.fctr.predict.RFE.X.glm.prob,sold.fctr.predict.RFE.X.bayesglm.prob,sold.fctr.predict.RFE.X.nnet.prob,sold.fctr.predict.RFE.X.avNNet.prob,sold.fctr.predict.RFE.X.gbm.prob
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.309 0.026
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.9087683 0.9764151 0.8411215 0.9733953
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.8773585 0.9422608
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.9290766 0.9677691 0.8134967
## max.AUCpROC.OOB max.Sens.OOB max.Spec.OOB max.AUCROCR.OOB
## 1 0.6272727 0.8181818 0.4363636 0.6417477
## opt.prob.threshold.OOB max.f.score.OOB max.Accuracy.OOB
## 1 0.4 0.4537815 0.731405
## max.AccuracyLower.OOB max.AccuracyUpper.OOB max.Kappa.OOB
## 1 0.6895485 0.7704077 0.2770475
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01481505 0.05253286
## [1] "Cross Validation issues:"
## Warning in get_dsp_models_df(): Cross Validation issues:
## MFO.myMFO_classfr Random.myrandom_classfr
## 0 0
## Max.cor.Y.rcv.1X1.glmnet Max.cor.Y.rcv.1X1.cp.0.rpart
## 0 0
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_point).
# Honour a user-specified model id; otherwise default to the first row of
#   dsp_models_df.
if (!is.null(glb_sel_mdl_id)) {
    print(sprintf("User specified selection: %s", glb_sel_mdl_id))
} else {
    glb_sel_mdl_id <- dsp_models_df[1, "id"]
}
## [1] "User specified selection: Ensemble.glmnet"
# Fetch the selected model object, then print its summary.
glb_sel_mdl <- glb_models_lst[[glb_sel_mdl_id]]
myprint_mdl(glb_sel_mdl)
## Length Class Mode
## a0 93 -none- numeric
## beta 744 dgCMatrix S4
## df 93 -none- numeric
## dim 2 -none- numeric
## lambda 93 -none- numeric
## dev.ratio 93 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 8 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.772081
## sold.fctr.predict.RFE.X.avNNet.prob
## 18.665537
## sold.fctr.predict.RFE.X.bayesglm.prob
## -5.982486
## sold.fctr.predict.RFE.X.earth.prob
## -2.498722
## sold.fctr.predict.RFE.X.gbm.prob
## -12.574421
## sold.fctr.predict.RFE.X.glm.prob
## 6.329718
## sold.fctr.predict.RFE.X.glmnet.prob
## 4.888828
## sold.fctr.predict.RFE.X.nnet.prob
## 1.865833
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.777437
## sold.fctr.predict.RFE.X.avNNet.prob
## 18.712700
## sold.fctr.predict.RFE.X.bayesglm.prob
## -6.073924
## sold.fctr.predict.RFE.X.earth.prob
## -2.512910
## sold.fctr.predict.RFE.X.gbm.prob
## -12.632938
## sold.fctr.predict.RFE.X.glm.prob
## 6.388467
## sold.fctr.predict.RFE.X.glmnet.prob
## 4.968946
## sold.fctr.predict.RFE.X.nnet.prob
## 1.873890
## [1] TRUE
#stop("here"); glb_to_sav()
# TODO: everything from here to save() should be wrapped in a single function;
#   the same sequence of steps is executed twice more, in the
#   fit.data.training & predict.data.new chunks.
# Header line for the fit-set diagnostics of the selected model.
print(sprintf("%s fit prediction diagnostics:", glb_sel_mdl_id))
## [1] "Ensemble.glmnet fit prediction diagnostics:"
# Add the selected model's prediction / error columns to the fit observations.
glb_fitobs_df <- glb_get_predictions(df=glb_fitobs_df, mdl_id=glb_sel_mdl_id,
rsp_var_out=glb_rsp_var_out)
# Header line for the OOB (out-of-bag) diagnostics of the selected model.
print(sprintf("%s OOB prediction diagnostics:", glb_sel_mdl_id))
## [1] "Ensemble.glmnet OOB prediction diagnostics:"
# Add the selected model's prediction / error columns to the OOB observations.
glb_OOBobs_df <- glb_get_predictions(glb_OOBobs_df, glb_sel_mdl_id, glb_rsp_var_out)

# Start a fresh feature-importance table from the selected model and keep a
#   copy of its importance under a model-specific column name.
glb_featsimp_df <- myget_feats_importance(mdl = glb_sel_mdl, featsimp_df = NULL)
glb_featsimp_df[[paste0(glb_sel_mdl_id, ".importance")]] <- glb_featsimp_df$importance

#mdl_id <-"RFE.X.glmnet"; glb_featsimp_df <- myget_feats_importance(glb_models_lst[[mdl_id]], glb_featsimp_df); glb_featsimp_df[, paste0(mdl_id, ".importance")] <- glb_featsimp_df$importance; print(glb_featsimp_df)
#print(head(sbst_featsimp_df <- subset(glb_featsimp_df, is.na(RFE.X.glmnet.importance) | (abs(RFE.X.YeoJohnson.glmnet.importance - RFE.X.glmnet.importance) > 0.0001), select=-importance)))
#print(orderBy(~ -cor.y.abs, subset(glb_feats_df, id %in% c(row.names(sbst_featsimp_df), "startprice.dcm1.is9", "D.weight.post.stop.sum"))))
print(glb_featsimp_df)
## importance
## sold.fctr.predict.RFE.X.avNNet.prob 100.00000
## sold.fctr.predict.RFE.X.glm.prob 60.59569
## sold.fctr.predict.RFE.X.glmnet.prob 56.02422
## sold.fctr.predict.RFE.X.nnet.prob 46.25126
## sold.fctr.predict.RFE.X.rpart.prob 40.27596
## sold.fctr.predict.RFE.X.earth.prob 32.26855
## sold.fctr.predict.RFE.X.bayesglm.prob 21.01501
## sold.fctr.predict.RFE.X.gbm.prob 0.00000
## Ensemble.glmnet.importance
## sold.fctr.predict.RFE.X.avNNet.prob 100.00000
## sold.fctr.predict.RFE.X.glm.prob 60.59569
## sold.fctr.predict.RFE.X.glmnet.prob 56.02422
## sold.fctr.predict.RFE.X.nnet.prob 46.25126
## sold.fctr.predict.RFE.X.rpart.prob 40.27596
## sold.fctr.predict.RFE.X.earth.prob 32.26855
## sold.fctr.predict.RFE.X.bayesglm.prob 21.01501
## sold.fctr.predict.RFE.X.gbm.prob 0.00000
# Used again in fit.data.training & predict.data.new chunks
# Prints diagnostic plots for model `mdl_id` on `obs_df`:
#   1. for (at most) the 5 most important features in glb_featsimp_df, a scatter
#      plot of the feature against the actual and the predicted response;
#   2. a prediction plot (regression or classification flavour) using the top
#      two important features as axes.
#
# Args:
#   obs_df:         data frame holding glb_rsp_var, the prediction column
#                   paste0(glb_rsp_var_out, mdl_id) and the feature columns.
#   mdl_id:         model id whose predictions are plotted.
#   prob_threshold: passed through to myplot_prediction_classification
#                   (classification only).
#
# Reads the globals glb_featsimp_df, glb_rsp_var, glb_rsp_var_out, glb_id_var,
#   glb_is_regression and glb_is_classification.
glb_analytics_diag_plots <- function(obs_df, mdl_id, prob_threshold=NULL) {
    # Model terms of factor features look like "<name>.fctr<level>"; reduce them
    #   to the factor's column name. Names that already are columns of obs_df
    #   are left alone: otherwise a feature that merely contains ".fctr"
    #   (e.g. "sold.fctr.predict.<mdl>.prob") is truncated to "sold.fctr".
    strip_fctr_level <- function(nms) {
        nms <- as.character(nms)
        is_term <- !is.na(nms) & !(nms %in% names(obs_df))
        nms[is_term] <- gsub("(.*?)\\.fctr(.*)", "\\1\\.fctr", nms[is_term])
        return(nms)
    }

    if (!is.null(featsimp_df <- glb_featsimp_df)) {
        # Row names are the model terms; drop back-ticks around non-syntactic names
        featsimp_df$feat <- gsub("`(.*?)`", "\\1", row.names(featsimp_df))
        # Split interaction terms "a:b" into feat = "a" and feat.interact = "b"
        featsimp_df$feat.interact <- gsub("(.*?):(.*)", "\\2", featsimp_df$feat)
        featsimp_df$feat <- gsub("(.*?):(.*)", "\\1", featsimp_df$feat)
        featsimp_df$feat.interact <- ifelse(featsimp_df$feat.interact == featsimp_df$feat,
                                            NA, featsimp_df$feat.interact)
        featsimp_df$feat <- strip_fctr_level(featsimp_df$feat)
        featsimp_df$feat.interact <- strip_fctr_level(featsimp_df$feat.interact)
        # One row per feature (pair), keeping the largest importance of its terms
        featsimp_df <- orderBy(~ -importance.max, summaryBy(importance ~ feat + feat.interact,
                                                            data=featsimp_df, FUN=max))
        #rex_str=":(.*)"; txt_vctr=tail(featsimp_df$feat); ret_lst <- regexec(rex_str, txt_vctr); ret_lst <- regmatches(txt_vctr, ret_lst); ret_vctr <- sapply(1:length(ret_lst), function(pos_ix) ifelse(length(ret_lst[[pos_ix]]) > 0, ret_lst[[pos_ix]], "")); print(ret_vctr <- ret_vctr[ret_vctr != ""])

        featsimp_df <- subset(featsimp_df, !is.na(importance.max))
        if (nrow(featsimp_df) > 5) {
            warning("Limiting important feature scatter plots to 5 out of ", nrow(featsimp_df))
            featsimp_df <- head(featsimp_df, 5)
        }

    #     if (!all(is.na(featsimp_df$feat.interact)))
    #         stop("not implemented yet")
        rsp_var_out <- paste0(glb_rsp_var_out, mdl_id)
        for (var in featsimp_df$feat) {
            plot_df <- melt(obs_df, id.vars=var,
                            measure.vars=c(glb_rsp_var, rsp_var_out))

    #         if (var == "<feat_name>") print(myplot_scatter(plot_df, var, "value",
    #                                              facet_colcol_name="variable") +
    #                       geom_vline(xintercept=<divider_val>, linetype="dotted")) else
                print(myplot_scatter(plot_df, var, "value", colorcol_name="variable",
                                     facet_colcol_name="variable", jitter=TRUE) +
                          guides(color=FALSE))
        }
    }

    if (glb_is_regression) {
        if (is.null(featsimp_df) || (nrow(featsimp_df) == 0))
            warning("No important features in glb_fin_mdl") else
            print(myplot_prediction_regression(df=obs_df,
                        feat_x=ifelse(nrow(featsimp_df) > 1, featsimp_df$feat[2],
                                      ".rownames"),
                                               feat_y=featsimp_df$feat[1],
                        rsp_var=glb_rsp_var, rsp_var_out=rsp_var_out,
                        id_vars=glb_id_var)
    #               + facet_wrap(reformulate(featsimp_df$feat[2])) # if [1 or 2] is a factor
    #               + geom_point(aes_string(color="<col_name>.fctr")) #  to color the plot
                  )
    }

    if (glb_is_classification) {
        if (is.null(featsimp_df) || (nrow(featsimp_df) == 0))
            warning("No features in selected model are statistically important")
        else print(myplot_prediction_classification(df=obs_df,
                    feat_x=ifelse(nrow(featsimp_df) > 1, featsimp_df$feat[2],
                                  ".rownames"),
                                                   feat_y=featsimp_df$feat[1],
                         rsp_var=glb_rsp_var,
                         rsp_var_out=rsp_var_out,
                         id_vars=glb_id_var,
                        prob_threshold=prob_threshold)
    #               + geom_hline(yintercept=<divider_val>, linetype = "dotted")
                    )
    }
}
# OOB diagnostic plots for the selected model; only binomial classification
#   supplies the model's optimal OOB probability threshold, all other cases
#   pass NULL (the function's default).
glb_analytics_diag_plots(
    obs_df = glb_OOBobs_df, mdl_id = glb_sel_mdl_id,
    prob_threshold = if (glb_is_classification && glb_is_binomial)
        glb_models_df[glb_models_df$id == glb_sel_mdl_id,
                      "opt.prob.threshold.OOB"] else
        NULL)
## [1] "Min/Max Boundaries: "
## UniqueID sold.fctr sold.fctr.predict.RFE.X.rpart.prob
## 1014 11860 Y 0.2015066
## 1 10001 N 0.2015066
## sold.fctr.predict.RFE.X.rpart sold.fctr.predict.RFE.X.rpart.err
## 1014 Y FALSE
## 1 Y TRUE
## sold.fctr.predict.RFE.X.rpart.err.abs
## 1014 0.7984934
## 1 0.2015066
## sold.fctr.predict.RFE.X.rpart.accurate
## 1014 TRUE
## 1 FALSE
## sold.fctr.predict.RFE.X.earth.prob sold.fctr.predict.RFE.X.earth
## 1014 0.1758691 Y
## 1 0.5000000 Y
## sold.fctr.predict.RFE.X.earth.err
## 1014 FALSE
## 1 TRUE
## sold.fctr.predict.RFE.X.earth.err.abs
## 1014 0.8241309
## 1 0.5000000
## sold.fctr.predict.RFE.X.earth.accurate
## 1014 TRUE
## 1 FALSE
## sold.fctr.predict.RFE.X.glmnet.prob sold.fctr.predict.RFE.X.glmnet
## 1014 0.2781431 Y
## 1 0.3888408 Y
## sold.fctr.predict.RFE.X.glmnet.err
## 1014 FALSE
## 1 TRUE
## sold.fctr.predict.RFE.X.glmnet.err.abs
## 1014 0.7218569
## 1 0.3888408
## sold.fctr.predict.RFE.X.glmnet.accurate
## 1014 TRUE
## 1 FALSE
## sold.fctr.predict.RFE.X.glm.prob sold.fctr.predict.RFE.X.glm
## 1014 0.3140367 Y
## 1 0.1808848 N
## sold.fctr.predict.RFE.X.glm.err sold.fctr.predict.RFE.X.glm.err.abs
## 1014 FALSE 0.6859633
## 1 FALSE 0.1808848
## sold.fctr.predict.RFE.X.glm.accurate
## 1014 TRUE
## 1 TRUE
## sold.fctr.predict.RFE.X.bayesglm.prob
## 1014 0.2422758
## 1 0.2733515
## sold.fctr.predict.RFE.X.bayesglm sold.fctr.predict.RFE.X.bayesglm.err
## 1014 N TRUE
## 1 N FALSE
## sold.fctr.predict.RFE.X.bayesglm.err.abs
## 1014 0.7577242
## 1 0.2733515
## sold.fctr.predict.RFE.X.bayesglm.accurate
## 1014 FALSE
## 1 TRUE
## sold.fctr.predict.RFE.X.nnet.prob sold.fctr.predict.RFE.X.nnet
## 1014 0.07749899 Y
## 1 0.07749899 Y
## sold.fctr.predict.RFE.X.nnet.err sold.fctr.predict.RFE.X.nnet.err.abs
## 1014 FALSE 0.92250101
## 1 TRUE 0.07749899
## sold.fctr.predict.RFE.X.nnet.accurate
## 1014 TRUE
## 1 FALSE
## sold.fctr.predict.RFE.X.avNNet.prob sold.fctr.predict.RFE.X.avNNet
## 1014 0.09718205 N
## 1 0.24267533 N
## sold.fctr.predict.RFE.X.avNNet.err
## 1014 TRUE
## 1 FALSE
## sold.fctr.predict.RFE.X.avNNet.err.abs
## 1014 0.9028179
## 1 0.2426753
## sold.fctr.predict.RFE.X.avNNet.accurate
## 1014 FALSE
## 1 TRUE
## sold.fctr.predict.RFE.X.gbm.prob sold.fctr.predict.RFE.X.gbm
## 1014 0.3836411 Y
## 1 0.4557725 Y
## sold.fctr.predict.RFE.X.gbm.err sold.fctr.predict.RFE.X.gbm.err.abs
## 1014 FALSE 0.6163589
## 1 TRUE 0.4557725
## sold.fctr.predict.RFE.X.gbm.accurate
## 1014 TRUE
## 1 FALSE
## sold.fctr.predict.Ensemble.glmnet.prob
## 1014 0.005585029
## 1 0.009260911
## sold.fctr.predict.Ensemble.glmnet
## 1014 N
## 1 N
## sold.fctr.predict.Ensemble.glmnet.err
## 1014 TRUE
## 1 FALSE
## sold.fctr.predict.Ensemble.glmnet.err.abs
## 1014 0.994414971
## 1 0.009260911
## sold.fctr.predict.Ensemble.glmnet.accurate
## 1014 FALSE
## 1 TRUE
## sold.fctr.predict.Ensemble.glmnet.error .label
## 1014 -0.394415 11860
## 1 0.000000 10001
## [1] "Inaccurate: "
## UniqueID sold.fctr sold.fctr.predict.RFE.X.rpart.prob
## 498 11107 Y 0.2015066
## 24 10030 Y 0.2015066
## 889 11648 Y 0.2015066
## 438 11015 Y 0.2015066
## 654 11319 Y 0.2015066
## 404 10962 Y 0.2015066
## sold.fctr.predict.RFE.X.rpart sold.fctr.predict.RFE.X.rpart.err
## 498 Y FALSE
## 24 Y FALSE
## 889 Y FALSE
## 438 Y FALSE
## 654 Y FALSE
## 404 Y FALSE
## sold.fctr.predict.RFE.X.rpart.err.abs
## 498 0.7984934
## 24 0.7984934
## 889 0.7984934
## 438 0.7984934
## 654 0.7984934
## 404 0.7984934
## sold.fctr.predict.RFE.X.rpart.accurate
## 498 TRUE
## 24 TRUE
## 889 TRUE
## 438 TRUE
## 654 TRUE
## 404 TRUE
## sold.fctr.predict.RFE.X.earth.prob sold.fctr.predict.RFE.X.earth
## 498 0.5000000 Y
## 24 0.5000000 Y
## 889 0.5000000 Y
## 438 0.1758691 Y
## 654 0.1758691 Y
## 404 0.1758691 Y
## sold.fctr.predict.RFE.X.earth.err
## 498 FALSE
## 24 FALSE
## 889 FALSE
## 438 FALSE
## 654 FALSE
## 404 FALSE
## sold.fctr.predict.RFE.X.earth.err.abs
## 498 0.5000000
## 24 0.5000000
## 889 0.5000000
## 438 0.8241309
## 654 0.8241309
## 404 0.8241309
## sold.fctr.predict.RFE.X.earth.accurate
## 498 TRUE
## 24 TRUE
## 889 TRUE
## 438 TRUE
## 654 TRUE
## 404 TRUE
## sold.fctr.predict.RFE.X.glmnet.prob sold.fctr.predict.RFE.X.glmnet
## 498 0.3744272 Y
## 24 0.3017909 Y
## 889 0.4501885 Y
## 438 0.2482708 Y
## 654 0.1901607 N
## 404 0.1680633 N
## sold.fctr.predict.RFE.X.glmnet.err
## 498 FALSE
## 24 FALSE
## 889 FALSE
## 438 FALSE
## 654 TRUE
## 404 TRUE
## sold.fctr.predict.RFE.X.glmnet.err.abs
## 498 0.6255728
## 24 0.6982091
## 889 0.5498115
## 438 0.7517292
## 654 0.8098393
## 404 0.8319367
## sold.fctr.predict.RFE.X.glmnet.accurate
## 498 TRUE
## 24 TRUE
## 889 TRUE
## 438 TRUE
## 654 FALSE
## 404 FALSE
## sold.fctr.predict.RFE.X.glm.prob sold.fctr.predict.RFE.X.glm
## 498 8.522655e-03 N
## 24 2.073974e-11 N
## 889 2.041276e-01 N
## 438 1.801022e-05 N
## 654 4.565013e-02 N
## 404 9.322773e-02 N
## sold.fctr.predict.RFE.X.glm.err sold.fctr.predict.RFE.X.glm.err.abs
## 498 TRUE 0.9914773
## 24 TRUE 1.0000000
## 889 TRUE 0.7958724
## 438 TRUE 0.9999820
## 654 TRUE 0.9543499
## 404 TRUE 0.9067723
## sold.fctr.predict.RFE.X.glm.accurate
## 498 FALSE
## 24 FALSE
## 889 FALSE
## 438 FALSE
## 654 FALSE
## 404 FALSE
## sold.fctr.predict.RFE.X.bayesglm.prob sold.fctr.predict.RFE.X.bayesglm
## 498 0.11525920 N
## 24 0.06113902 N
## 889 0.43210358 Y
## 438 0.08485894 N
## 654 0.42382052 Y
## 404 0.13047100 N
## sold.fctr.predict.RFE.X.bayesglm.err
## 498 TRUE
## 24 TRUE
## 889 FALSE
## 438 TRUE
## 654 FALSE
## 404 TRUE
## sold.fctr.predict.RFE.X.bayesglm.err.abs
## 498 0.8847408
## 24 0.9388610
## 889 0.5678964
## 438 0.9151411
## 654 0.5761795
## 404 0.8695290
## sold.fctr.predict.RFE.X.bayesglm.accurate
## 498 FALSE
## 24 FALSE
## 889 TRUE
## 438 FALSE
## 654 TRUE
## 404 FALSE
## sold.fctr.predict.RFE.X.nnet.prob sold.fctr.predict.RFE.X.nnet
## 498 0.07749899 Y
## 24 0.07749899 Y
## 889 0.58139093 Y
## 438 0.07749899 Y
## 654 0.07749899 Y
## 404 0.07749899 Y
## sold.fctr.predict.RFE.X.nnet.err sold.fctr.predict.RFE.X.nnet.err.abs
## 498 FALSE 0.9225010
## 24 FALSE 0.9225010
## 889 FALSE 0.4186091
## 438 FALSE 0.9225010
## 654 FALSE 0.9225010
## 404 FALSE 0.9225010
## sold.fctr.predict.RFE.X.nnet.accurate
## 498 TRUE
## 24 TRUE
## 889 TRUE
## 438 TRUE
## 654 TRUE
## 404 TRUE
## sold.fctr.predict.RFE.X.avNNet.prob sold.fctr.predict.RFE.X.avNNet
## 498 0.03426104 N
## 24 0.03285631 N
## 889 0.27358667 N
## 438 0.05070146 N
## 654 0.13285463 N
## 404 0.03434568 N
## sold.fctr.predict.RFE.X.avNNet.err
## 498 TRUE
## 24 TRUE
## 889 TRUE
## 438 TRUE
## 654 TRUE
## 404 TRUE
## sold.fctr.predict.RFE.X.avNNet.err.abs
## 498 0.9657390
## 24 0.9671437
## 889 0.7264133
## 438 0.9492985
## 654 0.8671454
## 404 0.9656543
## sold.fctr.predict.RFE.X.avNNet.accurate
## 498 FALSE
## 24 FALSE
## 889 FALSE
## 438 FALSE
## 654 FALSE
## 404 FALSE
## sold.fctr.predict.RFE.X.gbm.prob sold.fctr.predict.RFE.X.gbm
## 498 0.3392700 Y
## 24 0.2940717 Y
## 889 0.6500264 Y
## 438 0.2820575 Y
## 654 0.2192022 Y
## 404 0.2283996 Y
## sold.fctr.predict.RFE.X.gbm.err sold.fctr.predict.RFE.X.gbm.err.abs
## 498 FALSE 0.6607300
## 24 FALSE 0.7059283
## 889 FALSE 0.3499736
## 438 FALSE 0.7179425
## 654 FALSE 0.7807978
## 404 FALSE 0.7716004
## sold.fctr.predict.RFE.X.gbm.accurate
## 498 TRUE
## 24 TRUE
## 889 TRUE
## 438 TRUE
## 654 TRUE
## 404 TRUE
## sold.fctr.predict.Ensemble.glmnet.prob
## 498 0.0006660586
## 24 0.0010519725
## 889 0.0022207689
## 438 0.0025590569
## 654 0.0034113888
## 404 0.0034275688
## sold.fctr.predict.Ensemble.glmnet
## 498 N
## 24 N
## 889 N
## 438 N
## 654 N
## 404 N
## sold.fctr.predict.Ensemble.glmnet.err
## 498 TRUE
## 24 TRUE
## 889 TRUE
## 438 TRUE
## 654 TRUE
## 404 TRUE
## sold.fctr.predict.Ensemble.glmnet.err.abs
## 498 0.9993339
## 24 0.9989480
## 889 0.9977792
## 438 0.9974409
## 654 0.9965886
## 404 0.9965724
## sold.fctr.predict.Ensemble.glmnet.accurate
## 498 FALSE
## 24 FALSE
## 889 FALSE
## 438 FALSE
## 654 FALSE
## 404 FALSE
## sold.fctr.predict.Ensemble.glmnet.error
## 498 -0.3993339
## 24 -0.3989480
## 889 -0.3977792
## 438 -0.3974409
## 654 -0.3965886
## 404 -0.3965724
## UniqueID sold.fctr sold.fctr.predict.RFE.X.rpart.prob
## 703 11389 N 0.2015066
## 872 11599 N 0.2015066
## 918 11733 N 0.2015066
## 326 10807 N 0.2015066
## 444 11023 N 0.2015066
## 86 10116 N 0.2015066
## sold.fctr.predict.RFE.X.rpart sold.fctr.predict.RFE.X.rpart.err
## 703 Y TRUE
## 872 Y TRUE
## 918 Y TRUE
## 326 Y TRUE
## 444 Y TRUE
## 86 Y TRUE
## sold.fctr.predict.RFE.X.rpart.err.abs
## 703 0.2015066
## 872 0.2015066
## 918 0.2015066
## 326 0.2015066
## 444 0.2015066
## 86 0.2015066
## sold.fctr.predict.RFE.X.rpart.accurate
## 703 FALSE
## 872 FALSE
## 918 FALSE
## 326 FALSE
## 444 FALSE
## 86 FALSE
## sold.fctr.predict.RFE.X.earth.prob sold.fctr.predict.RFE.X.earth
## 703 0.1758691 Y
## 872 0.1758691 Y
## 918 0.1758691 Y
## 326 0.1758691 Y
## 444 0.1758691 Y
## 86 0.1758691 Y
## sold.fctr.predict.RFE.X.earth.err
## 703 TRUE
## 872 TRUE
## 918 TRUE
## 326 TRUE
## 444 TRUE
## 86 TRUE
## sold.fctr.predict.RFE.X.earth.err.abs
## 703 0.1758691
## 872 0.1758691
## 918 0.1758691
## 326 0.1758691
## 444 0.1758691
## 86 0.1758691
## sold.fctr.predict.RFE.X.earth.accurate
## 703 FALSE
## 872 FALSE
## 918 FALSE
## 326 FALSE
## 444 FALSE
## 86 FALSE
## sold.fctr.predict.RFE.X.glmnet.prob sold.fctr.predict.RFE.X.glmnet
## 703 0.2428677 Y
## 872 0.1881885 N
## 918 0.2886943 Y
## 326 0.2474664 Y
## 444 0.2586095 Y
## 86 0.1826444 N
## sold.fctr.predict.RFE.X.glmnet.err
## 703 TRUE
## 872 FALSE
## 918 TRUE
## 326 TRUE
## 444 TRUE
## 86 FALSE
## sold.fctr.predict.RFE.X.glmnet.err.abs
## 703 0.2428677
## 872 0.1881885
## 918 0.2886943
## 326 0.2474664
## 444 0.2586095
## 86 0.1826444
## sold.fctr.predict.RFE.X.glmnet.accurate
## 703 FALSE
## 872 TRUE
## 918 FALSE
## 326 FALSE
## 444 FALSE
## 86 TRUE
## sold.fctr.predict.RFE.X.glm.prob sold.fctr.predict.RFE.X.glm
## 703 2.070575e-08 N
## 872 9.555382e-02 N
## 918 7.545098e-02 N
## 326 7.824163e-01 Y
## 444 4.145346e-01 Y
## 86 4.881874e-01 Y
## sold.fctr.predict.RFE.X.glm.err sold.fctr.predict.RFE.X.glm.err.abs
## 703 FALSE 2.070575e-08
## 872 FALSE 9.555382e-02
## 918 FALSE 7.545098e-02
## 326 TRUE 7.824163e-01
## 444 TRUE 4.145346e-01
## 86 TRUE 4.881874e-01
## sold.fctr.predict.RFE.X.glm.accurate
## 703 TRUE
## 872 TRUE
## 918 TRUE
## 326 FALSE
## 444 FALSE
## 86 FALSE
## sold.fctr.predict.RFE.X.bayesglm.prob sold.fctr.predict.RFE.X.bayesglm
## 703 0.1008810 N
## 872 0.6899543 Y
## 918 0.1097052 N
## 326 0.5716128 Y
## 444 0.3546955 Y
## 86 0.4678880 Y
## sold.fctr.predict.RFE.X.bayesglm.err
## 703 FALSE
## 872 TRUE
## 918 FALSE
## 326 TRUE
## 444 TRUE
## 86 TRUE
## sold.fctr.predict.RFE.X.bayesglm.err.abs
## 703 0.1008810
## 872 0.6899543
## 918 0.1097052
## 326 0.5716128
## 444 0.3546955
## 86 0.4678880
## sold.fctr.predict.RFE.X.bayesglm.accurate
## 703 TRUE
## 872 FALSE
## 918 TRUE
## 326 FALSE
## 444 FALSE
## 86 FALSE
## sold.fctr.predict.RFE.X.nnet.prob sold.fctr.predict.RFE.X.nnet
## 703 0.58139093 Y
## 872 0.58139093 Y
## 918 0.07749899 Y
## 326 0.58139093 Y
## 444 0.58139093 Y
## 86 0.58139093 Y
## sold.fctr.predict.RFE.X.nnet.err sold.fctr.predict.RFE.X.nnet.err.abs
## 703 TRUE 0.58139093
## 872 TRUE 0.58139093
## 918 TRUE 0.07749899
## 326 TRUE 0.58139093
## 444 TRUE 0.58139093
## 86 TRUE 0.58139093
## sold.fctr.predict.RFE.X.nnet.accurate
## 703 FALSE
## 872 FALSE
## 918 FALSE
## 326 FALSE
## 444 FALSE
## 86 FALSE
## sold.fctr.predict.RFE.X.avNNet.prob sold.fctr.predict.RFE.X.avNNet
## 703 0.3456999 Y
## 872 0.4981990 Y
## 918 0.4267940 Y
## 326 0.2287781 N
## 444 0.3669225 Y
## 86 0.3359848 Y
## sold.fctr.predict.RFE.X.avNNet.err
## 703 TRUE
## 872 TRUE
## 918 TRUE
## 326 FALSE
## 444 TRUE
## 86 TRUE
## sold.fctr.predict.RFE.X.avNNet.err.abs
## 703 0.3456999
## 872 0.4981990
## 918 0.4267940
## 326 0.2287781
## 444 0.3669225
## 86 0.3359848
## sold.fctr.predict.RFE.X.avNNet.accurate
## 703 FALSE
## 872 FALSE
## 918 FALSE
## 326 TRUE
## 444 FALSE
## 86 FALSE
## sold.fctr.predict.RFE.X.gbm.prob sold.fctr.predict.RFE.X.gbm
## 703 0.3383963 Y
## 872 0.2808213 Y
## 918 0.3846128 Y
## 326 0.2780718 Y
## 444 0.3148874 Y
## 86 0.2086367 Y
## sold.fctr.predict.RFE.X.gbm.err sold.fctr.predict.RFE.X.gbm.err.abs
## 703 TRUE 0.3383963
## 872 TRUE 0.2808213
## 918 TRUE 0.3846128
## 326 TRUE 0.2780718
## 444 TRUE 0.3148874
## 86 TRUE 0.2086367
## sold.fctr.predict.RFE.X.gbm.accurate
## 703 FALSE
## 872 FALSE
## 918 FALSE
## 326 FALSE
## 444 FALSE
## 86 FALSE
## sold.fctr.predict.Ensemble.glmnet.prob
## 703 0.4149219
## 872 0.5049223
## 918 0.5743545
## 326 0.5967471
## 444 0.8224184
## 86 0.8462467
## sold.fctr.predict.Ensemble.glmnet
## 703 Y
## 872 Y
## 918 Y
## 326 Y
## 444 Y
## 86 Y
## sold.fctr.predict.Ensemble.glmnet.err
## 703 TRUE
## 872 TRUE
## 918 TRUE
## 326 TRUE
## 444 TRUE
## 86 TRUE
## sold.fctr.predict.Ensemble.glmnet.err.abs
## 703 0.4149219
## 872 0.5049223
## 918 0.5743545
## 326 0.5967471
## 444 0.8224184
## 86 0.8462467
## sold.fctr.predict.Ensemble.glmnet.accurate
## 703 FALSE
## 872 FALSE
## 918 FALSE
## 326 FALSE
## 444 FALSE
## 86 FALSE
## sold.fctr.predict.Ensemble.glmnet.error
## 703 0.01492185
## 872 0.10492226
## 918 0.17435453
## 326 0.19674715
## 444 0.42241843
## 86 0.44624668
## UniqueID sold.fctr sold.fctr.predict.RFE.X.rpart.prob
## 411 10978 N 0.2015066
## 243 10590 N 0.2015066
## 744 11443 N 0.2015066
## 709 11396 N 0.2015066
## 715 11402 N 0.2015066
## 130 10217 N 0.2015066
## sold.fctr.predict.RFE.X.rpart sold.fctr.predict.RFE.X.rpart.err
## 411 Y TRUE
## 243 Y TRUE
## 744 Y TRUE
## 709 Y TRUE
## 715 Y TRUE
## 130 Y TRUE
## sold.fctr.predict.RFE.X.rpart.err.abs
## 411 0.2015066
## 243 0.2015066
## 744 0.2015066
## 709 0.2015066
## 715 0.2015066
## 130 0.2015066
## sold.fctr.predict.RFE.X.rpart.accurate
## 411 FALSE
## 243 FALSE
## 744 FALSE
## 709 FALSE
## 715 FALSE
## 130 FALSE
## sold.fctr.predict.RFE.X.earth.prob sold.fctr.predict.RFE.X.earth
## 411 0.1758691 Y
## 243 0.1758691 Y
## 744 0.1758691 Y
## 709 0.1758691 Y
## 715 0.1758691 Y
## 130 0.1758691 Y
## sold.fctr.predict.RFE.X.earth.err
## 411 TRUE
## 243 TRUE
## 744 TRUE
## 709 TRUE
## 715 TRUE
## 130 TRUE
## sold.fctr.predict.RFE.X.earth.err.abs
## 411 0.1758691
## 243 0.1758691
## 744 0.1758691
## 709 0.1758691
## 715 0.1758691
## 130 0.1758691
## sold.fctr.predict.RFE.X.earth.accurate
## 411 FALSE
## 243 FALSE
## 744 FALSE
## 709 FALSE
## 715 FALSE
## 130 FALSE
## sold.fctr.predict.RFE.X.glmnet.prob sold.fctr.predict.RFE.X.glmnet
## 411 0.2560035 Y
## 243 0.2000869 Y
## 744 0.6500655 Y
## 709 0.2621345 Y
## 715 0.2430380 Y
## 130 0.7081611 Y
## sold.fctr.predict.RFE.X.glmnet.err
## 411 TRUE
## 243 TRUE
## 744 TRUE
## 709 TRUE
## 715 TRUE
## 130 TRUE
## sold.fctr.predict.RFE.X.glmnet.err.abs
## 411 0.2560035
## 243 0.2000869
## 744 0.6500655
## 709 0.2621345
## 715 0.2430380
## 130 0.7081611
## sold.fctr.predict.RFE.X.glmnet.accurate
## 411 FALSE
## 243 FALSE
## 744 FALSE
## 709 FALSE
## 715 FALSE
## 130 FALSE
## sold.fctr.predict.RFE.X.glm.prob sold.fctr.predict.RFE.X.glm
## 411 0.7381305 Y
## 243 0.7970268 Y
## 744 1.0000000 Y
## 709 0.9169169 Y
## 715 0.6778090 Y
## 130 1.0000000 Y
## sold.fctr.predict.RFE.X.glm.err sold.fctr.predict.RFE.X.glm.err.abs
## 411 TRUE 0.7381305
## 243 TRUE 0.7970268
## 744 TRUE 1.0000000
## 709 TRUE 0.9169169
## 715 TRUE 0.6778090
## 130 TRUE 1.0000000
## sold.fctr.predict.RFE.X.glm.accurate
## 411 FALSE
## 243 FALSE
## 744 FALSE
## 709 FALSE
## 715 FALSE
## 130 FALSE
## sold.fctr.predict.RFE.X.bayesglm.prob sold.fctr.predict.RFE.X.bayesglm
## 411 0.4764815 Y
## 243 0.7225393 Y
## 744 0.9451872 Y
## 709 0.7260382 Y
## 715 0.4876474 Y
## 130 0.9560858 Y
## sold.fctr.predict.RFE.X.bayesglm.err
## 411 TRUE
## 243 TRUE
## 744 TRUE
## 709 TRUE
## 715 TRUE
## 130 TRUE
## sold.fctr.predict.RFE.X.bayesglm.err.abs
## 411 0.4764815
## 243 0.7225393
## 744 0.9451872
## 709 0.7260382
## 715 0.4876474
## 130 0.9560858
## sold.fctr.predict.RFE.X.bayesglm.accurate
## 411 FALSE
## 243 FALSE
## 744 FALSE
## 709 FALSE
## 715 FALSE
## 130 FALSE
## sold.fctr.predict.RFE.X.nnet.prob sold.fctr.predict.RFE.X.nnet
## 411 0.58139093 Y
## 243 0.58139093 Y
## 744 0.07749899 Y
## 709 0.58139093 Y
## 715 0.58139093 Y
## 130 0.58139093 Y
## sold.fctr.predict.RFE.X.nnet.err sold.fctr.predict.RFE.X.nnet.err.abs
## 411 TRUE 0.58139093
## 243 TRUE 0.58139093
## 744 TRUE 0.07749899
## 709 TRUE 0.58139093
## 715 TRUE 0.58139093
## 130 TRUE 0.58139093
## sold.fctr.predict.RFE.X.nnet.accurate
## 411 FALSE
## 243 FALSE
## 744 FALSE
## 709 FALSE
## 715 FALSE
## 130 FALSE
## sold.fctr.predict.RFE.X.avNNet.prob sold.fctr.predict.RFE.X.avNNet
## 411 0.4937672 Y
## 243 0.5106608 Y
## 744 0.5020171 Y
## 709 0.5139877 Y
## 715 0.5139283 Y
## 130 0.5126690 Y
## sold.fctr.predict.RFE.X.avNNet.err
## 411 TRUE
## 243 TRUE
## 744 TRUE
## 709 TRUE
## 715 TRUE
## 130 TRUE
## sold.fctr.predict.RFE.X.avNNet.err.abs
## 411 0.4937672
## 243 0.5106608
## 744 0.5020171
## 709 0.5139877
## 715 0.5139283
## 130 0.5126690
## sold.fctr.predict.RFE.X.avNNet.accurate
## 411 FALSE
## 243 FALSE
## 744 FALSE
## 709 FALSE
## 715 FALSE
## 130 FALSE
## sold.fctr.predict.RFE.X.gbm.prob sold.fctr.predict.RFE.X.gbm
## 411 0.2780718 Y
## 243 0.1877549 N
## 744 0.2291950 Y
## 709 0.2161938 Y
## 715 0.1877549 N
## 130 0.2977911 Y
## sold.fctr.predict.RFE.X.gbm.err sold.fctr.predict.RFE.X.gbm.err.abs
## 411 TRUE 0.2780718
## 243 FALSE 0.1877549
## 744 TRUE 0.2291950
## 709 TRUE 0.2161938
## 715 FALSE 0.1877549
## 130 TRUE 0.2977911
## sold.fctr.predict.RFE.X.gbm.accurate
## 411 FALSE
## 243 TRUE
## 744 FALSE
## 709 FALSE
## 715 TRUE
## 130 FALSE
## sold.fctr.predict.Ensemble.glmnet.prob
## 411 0.9965914
## 243 0.9968208
## 744 0.9981451
## 709 0.9984968
## 715 0.9987437
## 130 0.9988711
## sold.fctr.predict.Ensemble.glmnet
## 411 Y
## 243 Y
## 744 Y
## 709 Y
## 715 Y
## 130 Y
## sold.fctr.predict.Ensemble.glmnet.err
## 411 TRUE
## 243 TRUE
## 744 TRUE
## 709 TRUE
## 715 TRUE
## 130 TRUE
## sold.fctr.predict.Ensemble.glmnet.err.abs
## 411 0.9965914
## 243 0.9968208
## 744 0.9981451
## 709 0.9984968
## 715 0.9987437
## 130 0.9988711
## sold.fctr.predict.Ensemble.glmnet.accurate
## 411 FALSE
## 243 FALSE
## 744 FALSE
## 709 FALSE
## 715 FALSE
## 130 FALSE
## sold.fctr.predict.Ensemble.glmnet.error
## 411 0.5965914
## 243 0.5968208
## 744 0.5981451
## 709 0.5984968
## 715 0.5987437
## 130 0.5988711
# Add the selected model's per-category statistics for the fit observations,
# joining on the category column and keeping unmatched rows from either side.
glb_ctgry_df <- merge(
    x = glb_ctgry_df,
    y = myget_category_stats(obs_df = glb_fitobs_df,
                             mdl_id = glb_sel_mdl_id,
                             label = "fit"),
    by = glb_category_var,
    all = TRUE)
rownames(glb_ctgry_df) <- glb_ctgry_df[, glb_category_var]

# Same for the OOB observations. No `by` is given here, so merge() joins on
# every column name the two frames share.
#by=glb_category_var, all=TRUE) glb_ctgry-df already contains .n.OOB ?
glb_ctgry_df <- merge(
    x = glb_ctgry_df,
    y = myget_category_stats(obs_df = glb_OOBobs_df,
                             mdl_id = glb_sel_mdl_id,
                             label = "OOB"),
    all = TRUE)
rownames(glb_ctgry_df) <- glb_ctgry_df[, glb_category_var]

# Show the categories sorted by descending mean absolute error: OOB-based when
# any evaluation metric name mentions "OOB", fit-based otherwise.
if (any(grepl("OOB", glbMdlMetricsEval))) {
    print(orderBy(~-err.abs.OOB.mean, glb_ctgry_df))
} else {
    print(orderBy(~-err.abs.fit.mean, glb_ctgry_df))
}
## prdl.descr.my.fctr .n.OOB .n.Tst .freqRatio.Tst .freqRatio.OOB
## iPadmini3#0 iPadmini3#0 21 18 0.04265403 0.04338843
## iPadmini2#1 iPadmini2#1 9 8 0.01895735 0.01859504
## iPad2#0 iPad2#0 35 31 0.07345972 0.07231405
## iPad4#0 iPad4#0 21 18 0.04265403 0.04338843
## Unknown#0 Unknown#0 35 31 0.07345972 0.07231405
## Unknown#1 Unknown#1 26 23 0.05450237 0.05371901
## iPad1#0 iPad1#0 18 16 0.03791469 0.03719008
## iPadAir2#0 iPadAir2#0 32 28 0.06635071 0.06611570
## iPadmini2#0 iPadmini2#0 17 14 0.03317536 0.03512397
## iPad1#1 iPad1#1 26 22 0.05213270 0.05371901
## iPad2#1 iPad2#1 59 53 0.12559242 0.12190083
## iPadmini#0 iPadmini#0 30 26 0.06161137 0.06198347
## iPad3#1 iPad3#1 23 20 0.04739336 0.04752066
## iPad4#1 iPad4#1 27 24 0.05687204 0.05578512
## iPadmini#1 iPadmini#1 31 26 0.06161137 0.06404959
## iPadAir#1 iPadAir#1 22 19 0.04502370 0.04545455
## iPad3#0 iPad3#0 14 12 0.02843602 0.02892562
## iPadAir#0 iPadAir#0 22 19 0.04502370 0.04545455
## iPadmini3#1 iPadmini3#1 7 6 0.01421801 0.01446281
## iPadAir2#1 iPadAir2#1 9 8 0.01895735 0.01859504
## err.abs.fit.sum err.abs.fit.mean .n.fit err.abs.OOB.sum
## iPadmini3#0 2.26890314 0.070903223 32 9.080860
## iPadmini2#1 0.90301012 0.082091829 11 3.602456
## iPad2#0 1.46974670 0.077355089 19 12.909094
## iPad4#0 2.70907924 0.193505660 14 7.564998
## Unknown#0 4.82909486 0.150909214 32 12.443978
## Unknown#1 3.11621011 0.115415189 27 8.854515
## iPad1#0 4.03995924 0.109188088 37 5.970610
## iPadAir2#0 5.52284739 0.117507391 47 9.281717
## iPadmini2#0 2.82227032 0.117594597 24 4.856905
## iPad1#1 1.45599724 0.055999894 26 7.231008
## iPad2#1 1.35982569 0.041206839 33 16.002257
## iPadmini#0 5.95122836 0.097561121 61 7.743971
## iPad3#1 4.00207996 0.117708234 34 5.782836
## iPad4#1 1.47089112 0.047448101 31 6.720209
## iPadmini#1 1.28096147 0.067419025 19 7.440098
## iPadAir#1 0.58445407 0.018853357 31 4.660690
## iPad3#0 0.04356988 0.008713976 5 2.610921
## iPadAir#0 2.81582512 0.090833068 31 4.033037
## iPadmini3#1 NA NA NA 1.029511
## iPadAir2#1 1.20189395 0.070699644 17 1.128339
## err.abs.OOB.mean
## iPadmini3#0 0.4324219
## iPadmini2#1 0.4002729
## iPad2#0 0.3688313
## iPad4#0 0.3602380
## Unknown#0 0.3555422
## Unknown#1 0.3405583
## iPad1#0 0.3317006
## iPadAir2#0 0.2900537
## iPadmini2#0 0.2857003
## iPad1#1 0.2781157
## iPad2#1 0.2712247
## iPadmini#0 0.2581324
## iPad3#1 0.2514277
## iPad4#1 0.2488966
## iPadmini#1 0.2400032
## iPadAir#1 0.2118495
## iPad3#0 0.1864944
## iPadAir#0 0.1833198
## iPadmini3#1 0.1470729
## iPadAir2#1 0.1253710
# Print the column totals of the per-category statistics, leaving out the
# category label column(s).
# The label column is excluded by exact name rather than with `-grep(...)`:
#   * `-grep()` yields `-integer(0)` when nothing matches, which selects ZERO
#     columns instead of all of them;
#   * grep() treats glb_category_var as a regular expression, so "." in the
#     name matches any character and other columns could be dropped by accident.
# `drop = FALSE` keeps a data frame even when a single column remains, which
# colSums() requires.
# NOTE(review): totals are NA for any column containing NA (e.g. a category
# with no fit observations); na.rm is deliberately left at its default so the
# gap stays visible.
print(colSums(glb_ctgry_df[, !(names(glb_ctgry_df) %in% glb_category_var),
                           drop = FALSE]))
## .n.OOB .n.Tst .freqRatio.Tst .freqRatio.OOB
## 484.000000 422.000000 1.000000 1.000000
## err.abs.fit.sum err.abs.fit.mean .n.fit err.abs.OOB.sum
## NA NA NA 138.948009
## err.abs.OOB.mean
## 5.567227
# Export the OOB observations: the id column plus every column whose name
# contains glb_rsp_var (matched literally). The output file name is
# <glb_out_pfx><glb_sel_mdl_id> with every "." turned into "_", followed by
# "_OOBobs.csv".
write.csv(
    x = glb_OOBobs_df[, c(glb_id_var,
                          grep(glb_rsp_var, names(glb_OOBobs_df),
                               fixed = TRUE, value = TRUE))],
    file = paste0(chartr(".", "_", paste0(glb_out_pfx, glb_sel_mdl_id)),
                  "_OOBobs.csv"),
    row.names = FALSE)

# Record the next minor step of the "fit.models" chunk.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.models", major.inc = FALSE)
## label step_major step_minor label_minor bgn end elapsed
## 12 fit.models 6 2 2 570.369 649.985 79.616
## 13 fit.models 6 3 3 649.986 NA NA
# if (sum(is.na(glb_allobs_df$D.P.http)) > 0)
# stop("fit.models_3: Why is this happening ?")
#stop(here"); glb_to_sav()
sync_glb_obs_df <- function() {
    # Copy columns that exist only in the fit / OOB frames back into the parent
    # frames glb_trnobs_df and glb_allobs_df.
    #
    # Takes no arguments. Reads glb_fitobs_df, glb_OOBobs_df, glb_newobs_df and
    # glb_rsp_var from the enclosing environment and updates glb_trnobs_df and
    # glb_allobs_df there via `<<-`. Rows are addressed through the parent
    # frame's `.lcn` column ("Fit" / "OOB").
    #
    # NOTE(review): values are assigned positionally, so the number and order
    # of "Fit" / "OOB" rows in the parent frame must match the rows of the
    # source frame -- confirm this holds when outliers have been removed.
    #
    # Merge or cbind ?

    # Work out which columns are missing BEFORE any column is added. Previously
    # the OOB setdiff() calls ran after the Fit loops had already created the
    # shared columns in the parent frames, so any column present in both
    # glb_fitobs_df and glb_OOBobs_df was skipped for the OOB rows and stayed NA.
    trn_cols_from_fit <- setdiff(names(glb_fitobs_df), names(glb_trnobs_df))
    all_cols_from_fit <- setdiff(names(glb_fitobs_df), names(glb_allobs_df))
    trn_cols_from_OOB <- setdiff(names(glb_OOBobs_df), names(glb_trnobs_df))
    all_cols_from_OOB <- setdiff(names(glb_OOBobs_df), names(glb_allobs_df))

    for (col in trn_cols_from_fit)
        glb_trnobs_df[glb_trnobs_df$.lcn == "Fit", col] <<- glb_fitobs_df[, col]
    for (col in all_cols_from_fit)
        glb_allobs_df[glb_allobs_df$.lcn == "Fit", col] <<- glb_fitobs_df[, col]

    # Only the glb_trnobs_df update is conditional (as in the original unbraced
    # `if`): it runs when the response is entirely NA in glb_newobs_df.
    if (all(is.na(glb_newobs_df[, glb_rsp_var]))) {
        for (col in trn_cols_from_OOB)
            glb_trnobs_df[glb_trnobs_df$.lcn == "OOB", col] <<- glb_OOBobs_df[, col]
    }

    # glb_allobs_df always receives the OOB columns.
    for (col in all_cols_from_OOB)
        glb_allobs_df[glb_allobs_df$.lcn == "OOB", col] <<- glb_OOBobs_df[, col]
}
# Push the columns that exist only in the fit / OOB frames into
# glb_trnobs_df and glb_allobs_df.
sync_glb_obs_df()
# Sanity check: list any columns of glb_newobs_df still absent from glb_allobs_df.
print(setdiff(names(glb_newobs_df), names(glb_allobs_df)))
## character(0)
# When glb_save_envir is set, persist the objects describing the selected model
# to "<glb_out_pfx>selmdl_dsk.RData".
if (glb_save_envir) {
    save(list = c("glb_feats_df",
                  "glb_allobs_df", #glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
                  "glb_models_df", "dsp_models_df", "glb_models_lst",
                  "glb_sel_mdl", "glb_sel_mdl_id",
                  "glb_model_type"),
         file = paste0(glb_out_pfx, "selmdl_dsk.RData"))
}
#load(paste0(glb_out_pfx, "selmdl_dsk.RData"))

# Discard the result of the last model fit.
rm(list = "ret_lst")

# Append "model.selected" to the list of available objects and replay the
# analytics Petri net with it. The assignment is kept inside the argument so
# that it is evaluated exactly when replay.petrisim() uses replay.trans.
replay.petrisim(
    pn = glb_analytics_pn,
    replay.trans = (glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
                                                "model.selected")),
    flip_coord = TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
# Register the start of the "fit.data.training" chunk in glb_chunks_df as a new
# major step (major.inc=TRUE); myadd_chunk is defined elsewhere.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training", major.inc=TRUE)
## label step_major step_minor label_minor bgn end
## 13 fit.models 6 3 3 649.986 659.988
## 14 fit.data.training 7 0 0 659.988 NA
## elapsed
## 13 10.002
## 14 NA
7.0: fit data training
#load(paste0(glb_inp_pfx, "dsk.RData"))
#stop(here"); glb_to_sav()
# Determine the final model (glb_fin_mdl / glb_fin_mdl_id). Three cases:
#   1. glb_fin_mdl_id is already set and names an entry of glb_models_lst:
#      reuse that model.
#   2. nrow(glb_fitobs_df) + length(glb_obsfit_outliers) == nrow(glb_trnobs_df):
#      register the selected model as "Final.<glb_sel_mdl_id>" without refitting.
#   3. Otherwise: refit on glb_trnobs_df (the else-branch below).
if (!is.null(glb_fin_mdl_id) && (glb_fin_mdl_id %in% names(glb_models_lst))) {
warning("Final model same as user selected model")
glb_fin_mdl <- glb_models_lst[[glb_fin_mdl_id]]
} else if (nrow(glb_fitobs_df) + length(glb_obsfit_outliers) == nrow(glb_trnobs_df))
# This condition should be for checking NAs for glb_rsp_var in glb_newobs_df instead ?
# if (!any(is.na(glb_newobs_df[, glb_rsp_var])))
{
warning("Final model same as glb_sel_mdl_id")
glb_fin_mdl_id <- paste0("Final.", glb_sel_mdl_id)
glb_fin_mdl <- glb_sel_mdl
glb_models_lst[[glb_fin_mdl_id]] <- glb_fin_mdl
} else {
# When the selected model id (or the configured ensemble) mentions "RFE",
# re-run recursive feature elimination on glb_trnobs_df and print how the
# chosen predictors differ from those in rfe_fit_results (created elsewhere).
# NOTE(review): `||` needs a length-1 right-hand side; confirm
# glb_mdl_ensemble is a single string when it is not NULL.
if (grepl("RFE", glb_sel_mdl_id) ||
(!is.null(glb_mdl_ensemble) && grepl("RFE", glb_mdl_ensemble))) {
indep_vars <- myadjust_interaction_feats(subset(glb_feats_df,
!nzv & (exclude.as.feat != 1))[, "id"])
rfe_trn_results <- myrun_rfe(glb_trnobs_df, indep_vars, glb_rfe_fit_sizes)
if (!isTRUE(all.equal(sort(predictors(rfe_trn_results)),
sort(predictors(rfe_fit_results))))) {
print("Diffs predictors(rfe_trn_results) vs. predictors(rfe_fit_results):")
print(setdiff(predictors(rfe_trn_results), predictors(rfe_fit_results)))
print("Diffs predictors(rfe_fit_results) vs. predictors(rfe_trn_results):")
print(setdiff(predictors(rfe_fit_results), predictors(rfe_trn_results)))
}
}
# Ensemble case: refit each sufficiently important component model on
# glb_trnobs_df and add its predictions to glb_trnobs_df and glb_newobs_df.
if (grepl("Ensemble", glb_sel_mdl_id)) {
# Find which models are relevant
mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), importance > 5)
# Fit selected models on glb_trnobs_df
# Component model ids are the importance row names with glb_rsp_var_out and
# ".prob" removed (both matched literally).
for (mdl_id in gsub(".prob", "",
gsub(glb_rsp_var_out, "", row.names(mdlimp_df), fixed = TRUE),
fixed = TRUE)) {
mdl_id_components <- unlist(strsplit(mdl_id, "[.]"))
# Id prefix for the refit: the last dot-separated component (passed below as
# train.method) is replaced by "Train".
mdlIdPfx <- paste0(c(head(mdl_id_components, -1), "Train"),
collapse = ".")
# RFE.X components use the predictors from the RFE re-run on the training
# data; all others reuse the comma-separated "feats" recorded in glb_models_df.
# NOTE(review): rfe_trn_results only exists if the RFE branch above ran.
if (grepl("RFE\\.X\\.", mdlIdPfx))
mdlIndepVars <- myadjust_interaction_feats(myextract_actual_feats(
predictors(rfe_trn_results))) else
mdlIndepVars <- trim(unlist(
strsplit(glb_models_df[glb_models_df$id == mdl_id, "feats"], "[,]")))
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = mdlIdPfx,
type = glb_model_type, tune.df = glb_tune_models_df,
trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = tail(mdl_id_components, 1))),
indep_vars = mdlIndepVars,
rsp_var = glb_rsp_var,
fit_df = glb_trnobs_df, OOB_df = NULL)
# Predict with the model just fitted (taken as the last id in glb_models_df).
# The default probability threshold is the opt.prob.threshold.OOB recorded
# for the original component model id (mdl_id).
glb_trnobs_df <- glb_get_predictions(df = glb_trnobs_df,
mdl_id = tail(glb_models_df$id, 1),
rsp_var_out = glb_rsp_var_out,
prob_threshold_def =
subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
glb_newobs_df <- glb_get_predictions(df = glb_newobs_df,
mdl_id = tail(glb_models_df$id, 1),
rsp_var_out = glb_rsp_var_out,
prob_threshold_def =
subset(glb_models_df, id == mdl_id)$opt.prob.threshold.OOB)
}
}
print("***************")
print("Outliers in obsTrnDF not deleted yet")
print("***************")
# "Final" model
# NOTE(review): model_method is assigned here but not referenced again in the
# visible part of this chunk.
if ((model_method <- glb_sel_mdl$method) == "custom")
# get actual method from the mdl_id
model_method <- tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)
# Predictors of the final model:
#   - ensemble: the important component prediction columns, renamed so that
#     ".Train" is inserted before the last name component (before
#     "<method>.prob" for binomial classification);
#   - otherwise: the comma-separated "feats" of the selected model.
if (grepl("Ensemble", glb_sel_mdl_id)) {
# Find which models are relevant
mdlimp_df <- subset(myget_feats_importance(glb_sel_mdl), importance > 5)
if (glb_is_classification && glb_is_binomial)
indep_vars_vctr <- gsub("(.*)\\.(.*)\\.prob", "\\1\\.Train\\.\\2\\.prob",
row.names(mdlimp_df)) else
indep_vars_vctr <- gsub("(.*)\\.(.*)", "\\1\\.Train\\.\\2",
row.names(mdlimp_df))
} else indep_vars_vctr <-
trim(unlist(strsplit(glb_models_df[glb_models_df$id ==
glb_sel_mdl_id
, "feats"], "[,]")))
# Discontinuing use of tune_finmdl_df;
# since final model needs to be cved on glb_trnobs_df
# The frame is still built from the selected model's bestTune (min == max ==
# best value) but is not passed to the myfit_mdl() call below.
tune_finmdl_df <- NULL
if (nrow(glb_sel_mdl$bestTune) > 0) {
for (param in names(glb_sel_mdl$bestTune)) {
#print(sprintf("param: %s", param))
if (glb_sel_mdl$bestTune[1, param] != "none")
tune_finmdl_df <- rbind(tune_finmdl_df,
data.frame(parameter=param,
min=glb_sel_mdl$bestTune[1, param],
max=glb_sel_mdl$bestTune[1, param],
by=1)) # by val does not matter
}
}
# Sync with parameters in mydsutils.R
#stop(here"); glb_to_sav(); glb_models_lst <- sav_models_lst; glb_models_df <- sav_models_df
# If the selected model id contains one of glb_preproc_methods (dots escaped,
# alternatives joined with "|"), extract that substring as the preProcess
# setting for the final fit; otherwise use none.
if (!is.null(glb_preproc_methods) &&
((match_pos <- regexpr(gsub(".", "\\.",
paste(glb_preproc_methods, collapse = "|"),
fixed = TRUE), glb_sel_mdl_id)) != -1))
ths_preProcess <- str_sub(glb_sel_mdl_id, match_pos,
match_pos + attr(match_pos, "match.length") - 1) else
ths_preProcess <- NULL
# Training rows for the final fit: all of glb_trnobs_df, minus any rows whose
# id is listed in glb_obstrn_outliers.
fit_trnobs_df <- if (is.null(glb_obstrn_outliers)) glb_trnobs_df else
glb_trnobs_df[!(glb_trnobs_df[, glb_id_var] %in% glb_obstrn_outliers), ]
# Force fitting of Final.glm to identify outliers
# "glm" is always fitted first; the selected model's own method (last
# dot-separated component of its id) follows unless it is also "glm".
method_vctr <- unique(c("glm", tail(unlist(strsplit(glb_sel_mdl_id, "[.]")), 1)))
for (method in method_vctr) {
#source("caret_nominalTrainWorkflow.R")
# glmnet requires at least 2 indep vars
if ((length(indep_vars_vctr) == 1) && (method %in% "glmnet"))
next
ret_lst <-
myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst = list(
id.prefix = ifelse(grepl("Ensemble", glb_sel_mdl_id),
"Final.Ensemble", "Final"),
type = glb_model_type, trainControl.method = "repeatedcv",
trainControl.number = glb_rcv_n_folds,
trainControl.repeats = glb_rcv_n_repeats,
trainControl.classProbs = glb_is_classification,
trainControl.summaryFunction = glbMdlMetricSummaryFn,
train.metric = glbMdlMetricSummary,
train.maximize = glbMdlMetricMaximize,
train.method = method,
train.preProcess = ths_preProcess)),
indep_vars = indep_vars_vctr, rsp_var = glb_rsp_var,
fit_df = fit_trnobs_df, OOB_df = NULL)
}
# `method` still holds the last element of method_vctr here. The final model is
# taken as the last entry of glb_models_lst, with its id read from the same row
# position of glb_models_df.
# NOTE(review): because "glm" is always the first element of method_vctr, this
# condition appears to hold in every case -- including when the glmnet fit was
# skipped above, in which case the last model would be the glm fit. Confirm.
# NOTE(review): assumes glb_models_df rows are in the same order as
# glb_models_lst -- verify.
if ((length(method_vctr) == 1) || (method != "glm")) {
glb_fin_mdl <- glb_models_lst[[length(glb_models_lst)]]
glb_fin_mdl_id <- glb_models_df[length(glb_models_lst), "id"]
}
}
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (3 fold, repeated 3 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 47 0.7938 0.1954 0.008046 0.02966
## 48 0.7925 0.1842 0.008727 0.03665
## 49 0.7924 0.2032 0.008059 0.02723
## 50 0.7944 0.2039 0.010348 0.03541
## 51 0.7951 0.2090 0.009372 0.03125 *
## 52 0.7895 0.1863 0.009388 0.03557
## 53 0.7905 0.1901 0.012569 0.04169
## 54 0.7928 0.1960 0.012366 0.04322
## 140 0.7934 0.1443 0.007760 0.03680
##
## The top 5 variables (out of 51):
## spdiff.cut.fctr(-1e+03,-100], spdiff.cut.fctr(-10,-1], spdiff.cut.fctr(-100,-10], spdiff.cut.fctr(-1,0], sprice.log10
##
## [1] "spdiff.cut.fctr(-1e+03,-100]"
## [2] "spdiff.cut.fctr(-10,-1]"
## [3] "spdiff.cut.fctr(-100,-10]"
## [4] "spdiff.cut.fctr(-1,0]"
## [5] "sprice.log10"
## [6] "sprice.d20nexp"
## [7] "sprice.root2"
## [8] "spdiff.cut.fctr(10,100]"
## [9] "spdiff.cut.fctr(100,1e+03]"
## [10] "D.terms.post.stop.n.log"
## [11] "spdiff.cut.fctr(1,10]"
## [12] "condition.fctrNew"
## [13] "D.wrds.unq.n.log"
## [14] "D.terms.post.stem.n.log"
## [15] "prdl.descr.my.fctriPadmini#1:.clusterid.fctr3"
## [16] "D.ratio.wrds.stop.n.wrds.n"
## [17] "prdl.descr.my.fctriPad1#1:.clusterid.fctr3"
## [18] "D.chrs.n.log"
## [19] "D.chrs.uppr.n.log"
## [20] "prdl.descr.my.fctriPadmini2#0"
## [21] "prdl.descr.my.fctrUnknown#1:.clusterid.fctr3"
## [22] "D.ratio.weight.sum.wrds.n"
## [23] "startprice.dcm1.is9"
## [24] "D.weight.post.stop.sum"
## [25] "D.weight.sum"
## [26] "prdl.descr.my.fctriPadAir2#1"
## [27] "D.wrds.n.log"
## [28] "D.weight.post.stem.sum"
## [29] "prdl.descr.my.fctrUnknown#1"
## [30] "cellular.fctr1"
## [31] "prdl.descr.my.fctriPadmini#0"
## [32] "storage.fctr16"
## [33] "cellular.fctr0:carrier.fctrNone"
## [34] "condition.fctrFor parts or not working"
## [35] "prdl.descr.my.fctriPad1#1"
## [36] "prdl.descr.my.fctrUnknown#1:.clusterid.fctr2"
## [37] "D.weight.sum.stem.stop.Ratio"
## [38] "startprice.dcm2.is9"
## [39] "prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2"
## [40] "storage.fctrUnknown"
## [41] "color.fctrWhite"
## [42] "prdl.descr.my.fctriPad3#1"
## [43] "cellular.fctrUnknown"
## [44] "prdl.descr.my.fctriPad4#1:.clusterid.fctr2"
## [45] "prdl.descr.my.fctriPadAir#0"
## [46] "D.wrds.stop.n.log"
## [47] "D.chrs.pnct13.n.log"
## [48] "D.chrs.pnct11.n.log"
## [49] "color.fctrGold"
## [50] "startprice.dgt1.is9"
## [51] "cellular.fctrUnknown:carrier.fctrUnknown"
## [1] "Diffs predictors(rfe_trn_results) vs. predictors(rfe_fit_results):"
## [1] "prdl.descr.my.fctrUnknown#1:.clusterid.fctr2"
## [2] "D.weight.sum.stem.stop.Ratio"
## [3] "startprice.dcm2.is9"
## [4] "storage.fctrUnknown"
## [5] "prdl.descr.my.fctriPad3#1"
## [6] "prdl.descr.my.fctriPad4#1:.clusterid.fctr2"
## [7] "D.chrs.pnct13.n.log"
## [8] "color.fctrGold"
## [9] "startprice.dgt1.is9"
## [1] "Diffs predictors(rfe_fit_results) vs. predictors(rfe_trn_results):"
## [1] "prdl.descr.my.fctriPad3#0"
## [2] "prdl.descr.my.fctriPadmini3#0"
## [3] "prdl.descr.my.fctriPad4#0"
## [4] "prdl.descr.my.fctriPadAir#1"
## [5] "prdl.descr.my.fctriPad2#1:.clusterid.fctr2"
## [6] "prdl.descr.my.fctriPadmini#1:.clusterid.fctr2"
## [7] "cellular.fctr1:carrier.fctrVerizon"
## [8] "cellular.fctr1:carrier.fctrUnknown"
## [9] "prdl.descr.my.fctriPadAir2#0"
## [10] "startprice.dgt2.is9"
## [11] "cellular.fctr1:carrier.fctrT-Mobile"
## [1] "fitting model: RFE.X.Train.avNNet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Aggregating results
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
## Selecting tuning parameters
## Fitting size = 7, decay = 1e-04, bag = FALSE on full training set
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdlIdPfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: bag
## Length Class Mode
## model 5 -none- list
## repeats 1 -none- numeric
## bag 1 -none- logical
## names 137 -none- character
## terms 3 terms call
## coefnames 137 -none- character
## xlevels 0 -none- list
## xNames 137 -none- character
## problemType 1 -none- character
## tuneValue 3 data.frame list
## obsLevels 2 -none- character
## sold.fctr sold.fctr.predict.RFE.X.Train.avNNet.N
## 1 N 753
## 2 Y 64
## sold.fctr.predict.RFE.X.Train.avNNet.Y
## 1 45
## 2 153
## Prediction
## Reference N Y
## N 753 45
## Y 64 153
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.926108e-01 6.700349e-01 8.719201e-01 9.109901e-01 7.862069e-01
## AccuracyPValue McnemarPValue
## 3.222827e-19 8.469208e-02
## id
## 1 RFE.X.Train.avNNet
## feats
## 1 spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 95.895 2.592
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.7467661 0.9912281 0.5023041 0.8923721
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.7373494 0.7898177
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.8719201 0.9109901 0.1487353
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01277017 0.09477584
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.3
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.3
## [1] "fitting model: RFE.X.Train.glm"
## [1] " indep_vars: spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## + Fold1.Rep1: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep1: parameter=none
## + Fold2.Rep1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep1: parameter=none
## + Fold3.Rep1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep1: parameter=none
## + Fold1.Rep2: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep2: parameter=none
## + Fold2.Rep2: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep2: parameter=none
## + Fold3.Rep2: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep2: parameter=none
## + Fold1.Rep3: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep3: parameter=none
## + Fold2.Rep3: parameter=none
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep3: parameter=none
## + Fold3.Rep3: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep3: parameter=none
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8925 -0.6209 -0.2935 -0.0412 3.3455
##
## Coefficients: (50 not defined because of singularities)
## Estimate Std. Error
## (Intercept) -2.835e+00 2.414e+01
## D.chrs.n.log -8.370e+00 3.579e+00
## D.chrs.pnct11.n.log 1.765e-01 4.137e-01
## D.chrs.pnct13.n.log -1.938e-01 3.829e-01
## D.chrs.uppr.n.log 4.007e+00 2.738e+00
## D.ratio.weight.sum.wrds.n -5.836e-01 3.990e-01
## D.ratio.wrds.stop.n.wrds.n -1.360e+00 4.876e+00
## D.terms.post.stem.n.log 7.033e+00 1.098e+01
## D.terms.post.stop.n.log -1.032e+01 1.084e+01
## D.weight.post.stem.sum -1.674e+00 4.197e+00
## D.weight.post.stop.sum 1.681e+00 4.078e+00
## D.weight.sum NA NA
## D.weight.sum.stem.stop.Ratio 5.575e+00 2.341e+01
## D.wrds.n.log 6.064e+00 2.442e+00
## D.wrds.stop.n.log 1.473e-01 7.661e-01
## D.wrds.unq.n.log NA NA
## cellular.fctr1 4.201e-01 3.336e-01
## cellular.fctrUnknown -5.066e-01 5.214e-01
## color.fctrGold -3.606e-01 6.437e-01
## `color.fctrSpace Gray` 1.865e-02 4.159e-01
## color.fctrUnknown -1.529e-03 2.697e-01
## color.fctrWhite 4.618e-02 3.126e-01
## `condition.fctrFor parts or not working` -1.552e-01 5.232e-01
## `condition.fctrManufacturer refurbished` 1.212e-01 6.536e-01
## condition.fctrNew 2.765e-02 3.839e-01
## `condition.fctrNew other (see details)` 1.072e+00 4.997e-01
## `condition.fctrSeller refurbished` -6.338e-01 4.428e-01
## `prdl.descr.my.fctrUnknown#1` 1.009e+01 4.187e+00
## `prdl.descr.my.fctriPad1#0` -1.477e+00 7.412e-01
## `prdl.descr.my.fctriPad1#1` 8.080e+00 4.124e+00
## `prdl.descr.my.fctriPad2#0` -5.342e-01 6.417e-01
## `prdl.descr.my.fctriPad2#1` 9.459e+00 4.118e+00
## `prdl.descr.my.fctriPad3#0` -1.178e+00 8.711e-01
## `prdl.descr.my.fctriPad3#1` 1.002e+01 4.178e+00
## `prdl.descr.my.fctriPad4#0` 5.046e-02 6.651e-01
## `prdl.descr.my.fctriPad4#1` 1.207e+01 4.158e+00
## `prdl.descr.my.fctriPadAir#0` 7.281e-01 6.640e-01
## `prdl.descr.my.fctriPadAir#1` 1.004e+01 4.257e+00
## `prdl.descr.my.fctriPadAir2#0` -4.876e-02 7.330e-01
## `prdl.descr.my.fctriPadAir2#1` 1.258e+01 4.373e+00
## `prdl.descr.my.fctriPadmini#0` -7.839e-01 5.779e-01
## `prdl.descr.my.fctriPadmini#1` 9.592e+00 4.208e+00
## `prdl.descr.my.fctriPadmini2#0` -8.858e-02 6.539e-01
## `prdl.descr.my.fctriPadmini2#1` 1.000e+01 4.408e+00
## `prdl.descr.my.fctriPadmini3#0` -8.260e-01 7.478e-01
## `prdl.descr.my.fctriPadmini3#1` -1.900e+00 1.592e+03
## `spdiff.cut.fctr(-100,-10]` 3.389e+00 7.333e-01
## `spdiff.cut.fctr(-10,-1]` 5.605e+00 8.087e-01
## `spdiff.cut.fctr(-1,0]` 6.771e+00 1.086e+00
## `spdiff.cut.fctr(0,1]` 6.676e+00 1.319e+00
## `spdiff.cut.fctr(1,10]` 5.430e+00 8.168e-01
## `spdiff.cut.fctr(10,100]` 4.741e+00 8.005e-01
## `spdiff.cut.fctr(100,1e+03]` 7.278e-01 1.476e+00
## sprice.d20nexp 4.144e-01 5.511e+00
## sprice.log10 -1.383e+00 1.911e+00
## sprice.root2 1.064e-01 2.597e-01
## startprice.dcm1.is9 -7.246e-01 3.476e-01
## startprice.dcm2.is9 1.270e-01 3.473e-01
## startprice.dgt1.is9 -4.072e-02 2.447e-01
## storage.fctr16 -3.269e-01 7.096e-01
## storage.fctr32 -6.588e-01 7.175e-01
## storage.fctr64 1.519e-01 6.739e-01
## storage.fctrUnknown 5.745e-02 8.609e-01
## `cellular.fctr0:carrier.fctrNone` NA NA
## `cellular.fctr1:carrier.fctrNone` NA NA
## `cellular.fctrUnknown:carrier.fctrNone` NA NA
## `cellular.fctr0:carrier.fctrSprint` NA NA
## `cellular.fctr1:carrier.fctrSprint` -8.460e-02 6.707e-01
## `cellular.fctrUnknown:carrier.fctrSprint` NA NA
## `cellular.fctr0:carrier.fctrT-Mobile` NA NA
## `cellular.fctr1:carrier.fctrT-Mobile` 9.967e-02 9.088e-01
## `cellular.fctrUnknown:carrier.fctrT-Mobile` NA NA
## `cellular.fctr0:carrier.fctrUnknown` NA NA
## `cellular.fctr1:carrier.fctrUnknown` -1.014e+00 6.058e-01
## `cellular.fctrUnknown:carrier.fctrUnknown` NA NA
## `cellular.fctr0:carrier.fctrVerizon` NA NA
## `cellular.fctr1:carrier.fctrVerizon` 2.205e-01 4.454e-01
## `cellular.fctrUnknown:carrier.fctrVerizon` NA NA
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr2` -1.402e+01 4.823e+02
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr2` 6.719e-01 1.120e+00
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr2` -8.161e-01 7.670e-01
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr2` 1.169e+00 9.396e-01
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr2` -3.516e+00 1.312e+00
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr2` -1.801e+00 1.775e+00
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2` 4.391e-01 1.535e+00
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr2` 1.450e+00 1.064e+00
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2` 1.583e-01 1.674e+00
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2` -1.183e+00 2.057e+03
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr3` 3.599e+00 1.140e+00
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr3` 2.605e+00 9.620e-01
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr3` -2.584e-01 1.293e+00
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr3` 4.814e-01 1.206e+00
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr3` -2.013e-01 1.261e+00
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr3` 2.149e+00 1.694e+00
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr3` 2.903e+00 1.451e+00
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3` 1.688e+00 2.011e+00
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3` 1.723e+01 1.592e+03
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr4` 1.368e-01 1.733e+00
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr4` 1.397e+00 1.601e+00
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr4` -9.147e-02 1.497e+00
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4` -1.394e+01 1.695e+03
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4` NA NA
## z value Pr(>|z|)
## (Intercept) -0.117 0.90649
## D.chrs.n.log -2.339 0.01935 *
## D.chrs.pnct11.n.log 0.427 0.66968
## D.chrs.pnct13.n.log -0.506 0.61286
## D.chrs.uppr.n.log 1.463 0.14344
## D.ratio.weight.sum.wrds.n -1.462 0.14361
## D.ratio.wrds.stop.n.wrds.n -0.279 0.78024
## D.terms.post.stem.n.log 0.640 0.52200
## D.terms.post.stop.n.log -0.953 0.34075
## D.weight.post.stem.sum -0.399 0.69005
## D.weight.post.stop.sum 0.412 0.68022
## D.weight.sum NA NA
## D.weight.sum.stem.stop.Ratio 0.238 0.81178
## D.wrds.n.log 2.484 0.01301 *
## D.wrds.stop.n.log 0.192 0.84748
## D.wrds.unq.n.log NA NA
## cellular.fctr1 1.259 0.20804
## cellular.fctrUnknown -0.972 0.33123
## color.fctrGold -0.560 0.57535
## `color.fctrSpace Gray` 0.045 0.96422
## color.fctrUnknown -0.006 0.99548
## color.fctrWhite 0.148 0.88253
## `condition.fctrFor parts or not working` -0.297 0.76681
## `condition.fctrManufacturer refurbished` 0.185 0.85287
## condition.fctrNew 0.072 0.94259
## `condition.fctrNew other (see details)` 2.146 0.03189 *
## `condition.fctrSeller refurbished` -1.431 0.15239
## `prdl.descr.my.fctrUnknown#1` 2.410 0.01596 *
## `prdl.descr.my.fctriPad1#0` -1.992 0.04633 *
## `prdl.descr.my.fctriPad1#1` 1.959 0.05008 .
## `prdl.descr.my.fctriPad2#0` -0.832 0.40514
## `prdl.descr.my.fctriPad2#1` 2.297 0.02162 *
## `prdl.descr.my.fctriPad3#0` -1.352 0.17639
## `prdl.descr.my.fctriPad3#1` 2.398 0.01650 *
## `prdl.descr.my.fctriPad4#0` 0.076 0.93953
## `prdl.descr.my.fctriPad4#1` 2.902 0.00371 **
## `prdl.descr.my.fctriPadAir#0` 1.097 0.27283
## `prdl.descr.my.fctriPadAir#1` 2.358 0.01835 *
## `prdl.descr.my.fctriPadAir2#0` -0.067 0.94696
## `prdl.descr.my.fctriPadAir2#1` 2.878 0.00401 **
## `prdl.descr.my.fctriPadmini#0` -1.357 0.17492
## `prdl.descr.my.fctriPadmini#1` 2.280 0.02262 *
## `prdl.descr.my.fctriPadmini2#0` -0.135 0.89225
## `prdl.descr.my.fctriPadmini2#1` 2.269 0.02329 *
## `prdl.descr.my.fctriPadmini3#0` -1.105 0.26936
## `prdl.descr.my.fctriPadmini3#1` -0.001 0.99905
## `spdiff.cut.fctr(-100,-10]` 4.622 3.80e-06 ***
## `spdiff.cut.fctr(-10,-1]` 6.931 4.19e-12 ***
## `spdiff.cut.fctr(-1,0]` 6.236 4.50e-10 ***
## `spdiff.cut.fctr(0,1]` 5.063 4.14e-07 ***
## `spdiff.cut.fctr(1,10]` 6.648 2.97e-11 ***
## `spdiff.cut.fctr(10,100]` 5.923 3.16e-09 ***
## `spdiff.cut.fctr(100,1e+03]` 0.493 0.62207
## sprice.d20nexp 0.075 0.94006
## sprice.log10 -0.724 0.46923
## sprice.root2 0.410 0.68191
## startprice.dcm1.is9 -2.085 0.03711 *
## startprice.dcm2.is9 0.366 0.71467
## startprice.dgt1.is9 -0.166 0.86783
## storage.fctr16 -0.461 0.64499
## storage.fctr32 -0.918 0.35846
## storage.fctr64 0.225 0.82169
## storage.fctrUnknown 0.067 0.94679
## `cellular.fctr0:carrier.fctrNone` NA NA
## `cellular.fctr1:carrier.fctrNone` NA NA
## `cellular.fctrUnknown:carrier.fctrNone` NA NA
## `cellular.fctr0:carrier.fctrSprint` NA NA
## `cellular.fctr1:carrier.fctrSprint` -0.126 0.89963
## `cellular.fctrUnknown:carrier.fctrSprint` NA NA
## `cellular.fctr0:carrier.fctrT-Mobile` NA NA
## `cellular.fctr1:carrier.fctrT-Mobile` 0.110 0.91267
## `cellular.fctrUnknown:carrier.fctrT-Mobile` NA NA
## `cellular.fctr0:carrier.fctrUnknown` NA NA
## `cellular.fctr1:carrier.fctrUnknown` -1.674 0.09421 .
## `cellular.fctrUnknown:carrier.fctrUnknown` NA NA
## `cellular.fctr0:carrier.fctrVerizon` NA NA
## `cellular.fctr1:carrier.fctrVerizon` 0.495 0.62053
## `cellular.fctrUnknown:carrier.fctrVerizon` NA NA
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr2` -0.029 0.97681
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr2` 0.600 0.54841
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr2` -1.064 0.28730
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr2` 1.245 0.21330
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr2` -2.679 0.00738 **
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr2` -1.015 0.31034
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2` 0.286 0.77481
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr2` 1.362 0.17311
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2` 0.095 0.92466
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2` -0.001 0.99954
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr3` 3.157 0.00160 **
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr3` 2.708 0.00678 **
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr3` -0.200 0.84157
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr3` 0.399 0.68979
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr3` -0.160 0.87312
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr3` 1.269 0.20461
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr3` 2.001 0.04534 *
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3` 0.840 0.40116
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3` 0.011 0.99137
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr4` 0.079 0.93709
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr4` 0.873 0.38291
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr4` -0.061 0.95126
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4` -0.008 0.99344
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4` NA NA
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4` NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1053.45 on 1014 degrees of freedom
## Residual deviance: 737.29 on 927 degrees of freedom
## AIC: 913.29
##
## Number of Fisher Scoring iterations: 15
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## sold.fctr sold.fctr.predict.RFE.X.Train.glm.N
## 1 N 661
## 2 Y 65
## sold.fctr.predict.RFE.X.Train.glm.Y
## 1 137
## 2 152
## Prediction
## Reference N Y
## N 661 137
## Y 65 152
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.009852e-01 4.717954e-01 7.750724e-01 8.251388e-01 7.862069e-01
## AccuracyPValue McnemarPValue
## 1.330771e-01 5.867058e-07
## id
## 1 RFE.X.Train.glm
## feats
## 1 spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 4.393 0.33
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.6860902 0.943609 0.4285714 0.8549917
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.6007905 0.7727557
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7750724 0.8251388 0.2365187
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01973687 0.03760659
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.3
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.3
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## [1] "fitting model: RFE.X.Train.glmnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0.55, lambda = 0.038 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdlIdPfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: lambda
## Length Class Mode
## a0 79 -none- numeric
## beta 10823 dgCMatrix S4
## df 79 -none- numeric
## dim 2 -none- numeric
## lambda 79 -none- numeric
## dev.ratio 79 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 137 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -1.0596521315
## D.terms.post.stem.n.log
## -0.0004105044
## D.terms.post.stop.n.log
## -0.0087471800
## spdiff.cut.fctr(-10,-1]
## 1.2703640381
## spdiff.cut.fctr(-1,0]
## 1.8122956632
## spdiff.cut.fctr(0,1]
## 0.5582644539
## spdiff.cut.fctr(1,10]
## 0.8651457925
## spdiff.cut.fctr(10,100]
## 0.5795734919
## spdiff.cut.fctr(100,1e+03]
## -0.0586463571
## sprice.log10
## -0.0913633933
## sprice.root2
## -0.0082192714
## startprice.dcm1.is9
## -0.0093946406
## cellular.fctr1:carrier.fctrUnknown
## -0.0094534047
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr2
## -0.0841166253
## prdl.descr.my.fctriPad2#1:.clusterid.fctr2
## -0.0238836868
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## 0.8683715086
## prdl.descr.my.fctriPad1#1:.clusterid.fctr3
## 0.4518331429
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## 0.4060409738
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -0.9810497433
## D.terms.post.stem.n.log
## -0.0056081805
## D.terms.post.stop.n.log
## -0.0148047319
## D.wrds.unq.n.log
## -0.0005308545
## spdiff.cut.fctr(-10,-1]
## 1.3220567042
## spdiff.cut.fctr(-1,0]
## 1.9033912222
## spdiff.cut.fctr(0,1]
## 0.6990621689
## spdiff.cut.fctr(1,10]
## 0.9150545354
## spdiff.cut.fctr(10,100]
## 0.6247100464
## spdiff.cut.fctr(100,1e+03]
## -0.1004955151
## sprice.log10
## -0.1077594222
## sprice.root2
## -0.0075469078
## startprice.dcm1.is9
## -0.0346483180
## cellular.fctr1:carrier.fctrUnknown
## -0.0474878917
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr2
## -0.1514878934
## prdl.descr.my.fctriPad2#1:.clusterid.fctr2
## -0.0864699071
## prdl.descr.my.fctriPad4#1:.clusterid.fctr2
## -0.0148956069
## prdl.descr.my.fctrUnknown#1:.clusterid.fctr3
## 0.9839857227
## prdl.descr.my.fctriPad1#1:.clusterid.fctr3
## 0.5081241794
## prdl.descr.my.fctriPadmini#1:.clusterid.fctr3
## 0.5211332496
## sold.fctr sold.fctr.predict.RFE.X.Train.glmnet.N
## 1 N 550
## 2 Y 60
## sold.fctr.predict.RFE.X.Train.glmnet.Y
## 1 248
## 2 157
## Prediction
## Reference N Y
## N 550 248
## Y 60 157
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 6.965517e-01 3.137677e-01 6.672349e-01 7.247236e-01 7.862069e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 1.646881e-26
## id
## 1 RFE.X.Train.glmnet
## feats
## 1 spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 20.758 1.635
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5389684 0.9949875 0.08294931 0.7546747
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.5048232 0.7944209
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.6672349 0.7247236 0.08833229
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.00865625 0.04407723
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.2
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.2
## [1] "fitting model: RFE.X.Train.nnet"
## [1] " indep_vars: spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
## Aggregating results
## Warning in train.default(x, y, weights = w, ...): missing values found in
## aggregated results
## Selecting tuning parameters
## Fitting size = 1, decay = 0.001 on full training set
## # weights: 140
## initial value 539.716187
## iter 10 value 480.233007
## iter 20 value 412.880806
## iter 30 value 389.822722
## iter 40 value 383.723124
## iter 50 value 382.923152
## iter 60 value 382.676747
## iter 70 value 382.570502
## iter 80 value 382.518867
## iter 90 value 382.502277
## iter 100 value 382.496270
## final value 382.496270
## stopped after 100 iterations
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdlIdPfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: size
## a 137-1-1 network with 140 weights
## options were - entropy fitting decay=0.001
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1
## 4.12 7.92 5.47 3.84 -5.10 3.21 11.99 4.44
## i8->h1 i9->h1 i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1
## -0.70 1.01 -5.18 1.03 2.48 1.21 -6.69 4.47
## i16->h1 i17->h1 i18->h1 i19->h1 i20->h1 i21->h1 i22->h1 i23->h1
## -2.59 3.63 11.49 -0.40 -2.82 -2.83 5.07 1.38
## i24->h1 i25->h1 i26->h1 i27->h1 i28->h1 i29->h1 i30->h1 i31->h1
## -0.22 2.72 3.60 -0.43 15.19 8.21 14.02 10.65
## i32->h1 i33->h1 i34->h1 i35->h1 i36->h1 i37->h1 i38->h1 i39->h1
## 15.34 2.31 3.54 -11.36 -1.40 0.24 -6.99 0.24
## i40->h1 i41->h1 i42->h1 i43->h1 i44->h1 i45->h1 i46->h1 i47->h1
## 9.72 1.89 1.46 1.06 0.92 -11.21 -9.43 -26.44
## i48->h1 i49->h1 i50->h1 i51->h1 i52->h1 i53->h1 i54->h1 i55->h1
## -32.59 -14.11 -35.50 -14.64 14.35 -1.77 -11.82 3.21
## i56->h1 i57->h1 i58->h1 i59->h1 i60->h1 i61->h1 i62->h1 i63->h1
## 0.97 7.45 -3.42 1.41 3.42 0.39 -3.53 3.14
## i64->h1 i65->h1 i66->h1 i67->h1 i68->h1 i69->h1 i70->h1 i71->h1
## 0.04 0.00 0.01 -6.29 -0.02 -0.02 -0.90 0.02
## i72->h1 i73->h1 i74->h1 i75->h1 i76->h1 i77->h1 i78->h1 i79->h1
## 0.04 3.20 3.55 0.02 -0.06 -0.04 -0.01 2.49
## i80->h1 i81->h1 i82->h1 i83->h1 i84->h1 i85->h1 i86->h1 i87->h1
## -0.01 -2.81 0.01 -0.77 0.02 -8.80 0.02 25.30
## i88->h1 i89->h1 i90->h1 i91->h1 i92->h1 i93->h1 i94->h1 i95->h1
## -0.02 1.10 0.03 0.05 -0.02 -9.26 -0.04 -5.71
## i96->h1 i97->h1 i98->h1 i99->h1 i100->h1 i101->h1 i102->h1 i103->h1
## 0.04 1.00 -0.02 -11.90 -0.03 -10.12 0.00 0.89
## i104->h1 i105->h1 i106->h1 i107->h1 i108->h1 i109->h1 i110->h1 i111->h1
## 0.01 -1.18 -0.02 0.61 0.01 0.04 0.00 -0.04
## i112->h1 i113->h1 i114->h1 i115->h1 i116->h1 i117->h1 i118->h1 i119->h1
## 0.03 -17.73 0.03 1.28 -0.02 -12.26 0.02 -0.37
## i120->h1 i121->h1 i122->h1 i123->h1 i124->h1 i125->h1 i126->h1 i127->h1
## 0.02 0.04 0.01 0.00 -0.04 0.00 0.01 0.00
## i128->h1 i129->h1 i130->h1 i131->h1 i132->h1 i133->h1 i134->h1 i135->h1
## 0.04 -15.95 0.04 0.02 -0.02 -4.43 -0.02 0.63
## i136->h1 i137->h1
## 0.02 0.01
## b->o h1->o
## 0.32 -3.21
## sold.fctr sold.fctr.predict.RFE.X.Train.nnet.N
## 1 N 666
## 2 Y 42
## sold.fctr.predict.RFE.X.Train.nnet.Y
## 1 132
## 2 175
## Prediction
## Reference N Y
## N 666 132
## Y 42 175
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 8.285714e-01 5.569487e-01 8.039492e-01 8.512704e-01 7.862069e-01
## AccuracyPValue McnemarPValue
## 4.316532e-04 1.508578e-11
## id
## 1 RFE.X.Train.nnet
## feats
## 1 spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 34.394 0.272
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.7773466 0.8496241 0.7050691 0.799672
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.6679389 0.7839073
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.8039492 0.8512704 0.06347849
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.005645114 0.1235106
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0
## [1] "fitting model: RFE.X.Train.rpart"
## [1] " indep_vars: spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## + Fold1.Rep1: cp=0.004608
## - Fold1.Rep1: cp=0.004608
## + Fold2.Rep1: cp=0.004608
## - Fold2.Rep1: cp=0.004608
## + Fold3.Rep1: cp=0.004608
## - Fold3.Rep1: cp=0.004608
## + Fold1.Rep2: cp=0.004608
## - Fold1.Rep2: cp=0.004608
## + Fold2.Rep2: cp=0.004608
## - Fold2.Rep2: cp=0.004608
## + Fold3.Rep2: cp=0.004608
## - Fold3.Rep2: cp=0.004608
## + Fold1.Rep3: cp=0.004608
## - Fold1.Rep3: cp=0.004608
## + Fold2.Rep3: cp=0.004608
## - Fold2.Rep3: cp=0.004608
## + Fold3.Rep3: cp=0.004608
## - Fold3.Rep3: cp=0.004608
## Aggregating results
## Selecting tuning parameters
## Fitting cp = 0.0207 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdlIdPfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: cp
## Call:
## rpart(formula = .outcome ~ ., control = list(minsplit = 20, minbucket = 7,
## cp = 0, maxcompete = 4, maxsurrogate = 5, usesurrogate = 2,
## surrogatestyle = 0, maxdepth = 30, xval = 0))
## n= 1015
##
## CP nsplit rel error
## 1 0.02073733 0 1
##
## Node number 1: 1015 observations
## predicted class=N expected loss=0.2137931 P(node) =1
## class counts: 798 217
## probabilities: 0.786 0.214
##
## n= 1015
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 1015 217 N (0.7862069 0.2137931) *
## sold.fctr sold.fctr.predict.RFE.X.Train.rpart.Y
## 1 N 798
## 2 Y 217
## Prediction
## Reference N Y
## N 0 798
## Y 0 217
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 2.137931e-01 0.000000e+00 1.889379e-01 2.403200e-01 7.862069e-01
## AccuracyPValue McnemarPValue
## 1.000000e+00 3.994509e-175
## id
## 1 RFE.X.Train.rpart
## feats
## 1 spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 4.077 0.107
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.5 1 0 0.5
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.2 0.3522727 0.7822607
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.1889379 0.24032 0.0808607
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.0105674 0.08277784
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.2
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.2
## [1] "fitting model: RFE.X.Train.earth"
## [1] " indep_vars: spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## Aggregating results
## Selecting tuning parameters
## Fitting nprune = 19, degree = 1 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst
## = list(id.prefix = mdlIdPfx, : model's bestTune found at an extreme of
## tuneGrid for parameter: degree
## Call: earth(x=matrix[1015,137], y=factor.object, keepxy=TRUE,
## glm=list(family=function.object), degree=1, nprune=19)
##
## GLM coefficients
## Y
## (Intercept) 2.624254
## spdiff.cut.fctr(-100,-10] 2.571327
## spdiff.cut.fctr(-10,-1] 4.229209
## spdiff.cut.fctr(-1,0] 5.393788
## spdiff.cut.fctr(0,1] 4.377535
## spdiff.cut.fctr(1,10] 3.745198
## spdiff.cut.fctr(10,100] 3.461995
## startprice.dcm1.is9 -0.477689
## h(D.chrs.n.log-3.3673) -5.543306
## h(0.584895-D.ratio.weight.sum.wrds.n) -4.166911
## h(D.ratio.weight.sum.wrds.n-0.903556) -20.447099
## h(D.ratio.weight.sum.wrds.n-1.04921) 20.627942
## h(D.terms.post.stop.n.log-2.89037) 34.630151
## h(D.wrds.n.log-1.94591) 5.469577
## h(0.157631-sprice.d20nexp) -27.256409
## h(sprice.d20nexp-0.157631) 84.203265
## h(6.07865-sprice.root2) -13.558137
##
## Earth selected 17 of 94 terms, and 13 of 137 predictors
## Termination condition: RSq changed by less than 0.001 at 94 terms
## Importance: spdiff.cut.fctr(-10,-1], spdiff.cut.fctr(1,10], ...
## Number of terms at each degree of interaction: 1 16 (additive model)
## Earth GCV 0.1393652 RSS 132.4071 GRSq 0.1725009 RSq 0.2239056
##
## GLM null.deviance 1053.446 (1014 dof) deviance 808.4047 (998 dof) iters 6
## sold.fctr sold.fctr.predict.RFE.X.Train.earth.N
## 1 N 653
## 2 Y 75
## sold.fctr.predict.RFE.X.Train.earth.Y
## 1 145
## 2 142
## Prediction
## Reference N Y
## N 653 145
## Y 75 142
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.832512e-01 4.230004e-01 7.566075e-01 8.082408e-01 7.862069e-01
## AccuracyPValue McnemarPValue
## 6.082313e-01 3.287645e-06
## id
## 1 RFE.X.Train.earth
## feats
## 1 spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 5 14.347 2.302
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.6247069 0.9498747 0.2995392 0.8157173
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.5634921 0.7848935
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7566075 0.8082408 0.2085149
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01359804 0.07898735
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.1
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.1
## [1] "fitting model: RFE.X.Train.bayesglm"
## [1] " indep_vars: spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr"
## + Fold1.Rep1: parameter=none
## - Fold1.Rep1: parameter=none
## + Fold2.Rep1: parameter=none
## - Fold2.Rep1: parameter=none
## + Fold3.Rep1: parameter=none
## - Fold3.Rep1: parameter=none
## + Fold1.Rep2: parameter=none
## - Fold1.Rep2: parameter=none
## + Fold2.Rep2: parameter=none
## - Fold2.Rep2: parameter=none
## + Fold3.Rep2: parameter=none
## - Fold3.Rep2: parameter=none
## + Fold1.Rep3: parameter=none
## - Fold1.Rep3: parameter=none
## + Fold2.Rep3: parameter=none
## - Fold2.Rep3: parameter=none
## + Fold3.Rep3: parameter=none
## - Fold3.Rep3: parameter=none
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7696 -0.6461 -0.3665 -0.1067 2.8881
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 4.077154 5.650298
## D.chrs.n.log -0.163366 0.455181
## D.chrs.pnct11.n.log -0.199225 0.349474
## D.chrs.pnct13.n.log -0.442355 0.322885
## D.chrs.uppr.n.log -0.152505 0.463869
## D.ratio.weight.sum.wrds.n -0.168131 0.270111
## D.ratio.wrds.stop.n.wrds.n -2.019639 2.004414
## D.terms.post.stem.n.log -0.329831 0.844374
## D.terms.post.stop.n.log -0.531801 0.873388
## D.weight.post.stem.sum 0.037764 0.384753
## D.weight.post.stop.sum 0.054213 0.368900
## D.weight.sum 0.037764 0.384753
## D.weight.sum.stem.stop.Ratio -2.657298 4.050153
## D.wrds.n.log 0.344644 0.753090
## D.wrds.stop.n.log 0.452279 0.446345
## D.wrds.unq.n.log -0.329831 0.844374
## cellular.fctr1 0.268422 1.289156
## cellular.fctrUnknown -0.245586 1.642390
## color.fctrGold -0.195280 0.569080
## `color.fctrSpace Gray` 0.046485 0.379601
## color.fctrUnknown 0.083617 0.249200
## color.fctrWhite 0.031881 0.285685
## `condition.fctrFor parts or not working` -0.023756 0.436532
## `condition.fctrManufacturer refurbished` -0.009887 0.561852
## condition.fctrNew -0.039580 0.346036
## `condition.fctrNew other (see details)` 0.996091 0.440939
## `condition.fctrSeller refurbished` -0.741721 0.399996
## `prdl.descr.my.fctrUnknown#1` 0.319163 0.848135
## `prdl.descr.my.fctriPad1#0` -1.205641 0.595207
## `prdl.descr.my.fctriPad1#1` -1.051389 0.842542
## `prdl.descr.my.fctriPad2#0` -0.433979 0.526274
## `prdl.descr.my.fctriPad2#1` -0.210557 0.723514
## `prdl.descr.my.fctriPad3#0` -0.981572 0.727133
## `prdl.descr.my.fctriPad3#1` 0.147003 0.749325
## `prdl.descr.my.fctriPad4#0` 0.077133 0.552420
## `prdl.descr.my.fctriPad4#1` 1.601372 0.801494
## `prdl.descr.my.fctriPadAir#0` 0.568434 0.542259
## `prdl.descr.my.fctriPadAir#1` 0.231427 0.912149
## `prdl.descr.my.fctriPadAir2#0` 0.042705 0.572674
## `prdl.descr.my.fctriPadAir2#1` 1.891376 1.084977
## `prdl.descr.my.fctriPadmini#0` -0.680161 0.465593
## `prdl.descr.my.fctriPadmini#1` -0.008474 0.788026
## `prdl.descr.my.fctriPadmini2#0` -0.050312 0.536402
## `prdl.descr.my.fctriPadmini2#1` 0.423126 1.001118
## `prdl.descr.my.fctriPadmini3#0` -0.640003 0.616474
## `prdl.descr.my.fctriPadmini3#1` 0.622012 1.561426
## `spdiff.cut.fctr(-100,-10]` 2.346146 0.498210
## `spdiff.cut.fctr(-10,-1]` 4.326179 0.562811
## `spdiff.cut.fctr(-1,0]` 5.400674 0.855848
## `spdiff.cut.fctr(0,1]` 5.250724 1.101774
## `spdiff.cut.fctr(1,10]` 4.114998 0.575150
## `spdiff.cut.fctr(10,100]` 3.493423 0.549736
## `spdiff.cut.fctr(100,1e+03]` -0.126305 1.042879
## sprice.d20nexp 2.384901 3.053755
## sprice.log10 -0.519274 0.881027
## sprice.root2 -0.001687 0.119363
## startprice.dcm1.is9 -0.617753 0.317700
## startprice.dcm2.is9 0.115421 0.315348
## startprice.dgt1.is9 0.008534 0.228759
## storage.fctr16 -0.237001 0.522636
## storage.fctr32 -0.551036 0.541493
## storage.fctr64 0.204369 0.515505
## storage.fctrUnknown 0.107833 0.668542
## `cellular.fctr0:carrier.fctrNone` -0.050242 1.285205
## `cellular.fctr1:carrier.fctrNone` 0.000000 2.500000
## `cellular.fctrUnknown:carrier.fctrNone` 0.000000 2.500000
## `cellular.fctr0:carrier.fctrSprint` 0.000000 2.500000
## `cellular.fctr1:carrier.fctrSprint` 0.123564 0.592289
## `cellular.fctrUnknown:carrier.fctrSprint` 0.000000 2.500000
## `cellular.fctr0:carrier.fctrT-Mobile` 0.000000 2.500000
## `cellular.fctr1:carrier.fctrT-Mobile` 0.249869 0.783538
## `cellular.fctrUnknown:carrier.fctrT-Mobile` 0.000000 2.500000
## `cellular.fctr0:carrier.fctrUnknown` 0.000000 2.500000
## `cellular.fctr1:carrier.fctrUnknown` -0.826950 0.538495
## `cellular.fctrUnknown:carrier.fctrUnknown` -0.245586 1.642390
## `cellular.fctr0:carrier.fctrVerizon` 0.000000 2.500000
## `cellular.fctr1:carrier.fctrVerizon` 0.179329 0.400698
## `cellular.fctrUnknown:carrier.fctrVerizon` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr2` -1.857606 1.555272
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr2` 0.344762 0.911491
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr2` -0.632122 0.651493
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr2` 0.875445 0.786296
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr2` -2.398563 1.008080
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr2` -0.762392 1.137219
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2` 0.752879 1.092789
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr2` 1.215487 0.845466
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2` 0.173707 1.088316
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2` -0.494770 1.937658
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr3` 3.238725 0.973840
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr3` 2.008264 0.808834
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr3` -0.665316 0.984395
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr3` 0.656076 0.906450
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr3` -0.126060 1.031659
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr3` 1.315876 1.261265
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr3` 2.354526 1.091408
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3` 1.214819 1.345509
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3` 3.111902 1.961705
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr4` 0.451192 1.218069
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr4` 1.308262 1.151321
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr4` -0.215568 1.128419
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4` -0.774322 1.741634
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4` 0.000000 2.500000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4` 0.000000 2.500000
## z value Pr(>|z|)
## (Intercept) 0.722 0.470551
## D.chrs.n.log -0.359 0.719667
## D.chrs.pnct11.n.log -0.570 0.568628
## D.chrs.pnct13.n.log -1.370 0.170684
## D.chrs.uppr.n.log -0.329 0.742332
## D.ratio.weight.sum.wrds.n -0.622 0.533646
## D.ratio.wrds.stop.n.wrds.n -1.008 0.313648
## D.terms.post.stem.n.log -0.391 0.696076
## D.terms.post.stop.n.log -0.609 0.542594
## D.weight.post.stem.sum 0.098 0.921813
## D.weight.post.stop.sum 0.147 0.883166
## D.weight.sum 0.098 0.921813
## D.weight.sum.stem.stop.Ratio -0.656 0.511761
## D.wrds.n.log 0.458 0.647211
## D.wrds.stop.n.log 1.013 0.310920
## D.wrds.unq.n.log -0.391 0.696076
## cellular.fctr1 0.208 0.835061
## cellular.fctrUnknown -0.150 0.881136
## color.fctrGold -0.343 0.731485
## `color.fctrSpace Gray` 0.122 0.902537
## color.fctrUnknown 0.336 0.737217
## color.fctrWhite 0.112 0.911145
## `condition.fctrFor parts or not working` -0.054 0.956601
## `condition.fctrManufacturer refurbished` -0.018 0.985961
## condition.fctrNew -0.114 0.908936
## `condition.fctrNew other (see details)` 2.259 0.023882 *
## `condition.fctrSeller refurbished` -1.854 0.063693 .
## `prdl.descr.my.fctrUnknown#1` 0.376 0.706685
## `prdl.descr.my.fctriPad1#0` -2.026 0.042808 *
## `prdl.descr.my.fctriPad1#1` -1.248 0.212076
## `prdl.descr.my.fctriPad2#0` -0.825 0.409584
## `prdl.descr.my.fctriPad2#1` -0.291 0.771036
## `prdl.descr.my.fctriPad3#0` -1.350 0.177041
## `prdl.descr.my.fctriPad3#1` 0.196 0.844469
## `prdl.descr.my.fctriPad4#0` 0.140 0.888954
## `prdl.descr.my.fctriPad4#1` 1.998 0.045718 *
## `prdl.descr.my.fctriPadAir#0` 1.048 0.294514
## `prdl.descr.my.fctriPadAir#1` 0.254 0.799715
## `prdl.descr.my.fctriPadAir2#0` 0.075 0.940555
## `prdl.descr.my.fctriPadAir2#1` 1.743 0.081292 .
## `prdl.descr.my.fctriPadmini#0` -1.461 0.144057
## `prdl.descr.my.fctriPadmini#1` -0.011 0.991420
## `prdl.descr.my.fctriPadmini2#0` -0.094 0.925271
## `prdl.descr.my.fctriPadmini2#1` 0.423 0.672548
## `prdl.descr.my.fctriPadmini3#0` -1.038 0.299193
## `prdl.descr.my.fctriPadmini3#1` 0.398 0.690364
## `spdiff.cut.fctr(-100,-10]` 4.709 2.49e-06 ***
## `spdiff.cut.fctr(-10,-1]` 7.687 1.51e-14 ***
## `spdiff.cut.fctr(-1,0]` 6.310 2.78e-10 ***
## `spdiff.cut.fctr(0,1]` 4.766 1.88e-06 ***
## `spdiff.cut.fctr(1,10]` 7.155 8.39e-13 ***
## `spdiff.cut.fctr(10,100]` 6.355 2.09e-10 ***
## `spdiff.cut.fctr(100,1e+03]` -0.121 0.903603
## sprice.d20nexp 0.781 0.434818
## sprice.log10 -0.589 0.555595
## sprice.root2 -0.014 0.988720
## startprice.dcm1.is9 -1.944 0.051841 .
## startprice.dcm2.is9 0.366 0.714356
## startprice.dgt1.is9 0.037 0.970241
## storage.fctr16 -0.453 0.650208
## storage.fctr32 -1.018 0.308857
## storage.fctr64 0.396 0.691777
## storage.fctrUnknown 0.161 0.871861
## `cellular.fctr0:carrier.fctrNone` -0.039 0.968817
## `cellular.fctr1:carrier.fctrNone` 0.000 1.000000
## `cellular.fctrUnknown:carrier.fctrNone` 0.000 1.000000
## `cellular.fctr0:carrier.fctrSprint` 0.000 1.000000
## `cellular.fctr1:carrier.fctrSprint` 0.209 0.834744
## `cellular.fctrUnknown:carrier.fctrSprint` 0.000 1.000000
## `cellular.fctr0:carrier.fctrT-Mobile` 0.000 1.000000
## `cellular.fctr1:carrier.fctrT-Mobile` 0.319 0.749803
## `cellular.fctrUnknown:carrier.fctrT-Mobile` 0.000 1.000000
## `cellular.fctr0:carrier.fctrUnknown` 0.000 1.000000
## `cellular.fctr1:carrier.fctrUnknown` -1.536 0.124620
## `cellular.fctrUnknown:carrier.fctrUnknown` -0.150 0.881136
## `cellular.fctr0:carrier.fctrVerizon` 0.000 1.000000
## `cellular.fctr1:carrier.fctrVerizon` 0.448 0.654484
## `cellular.fctrUnknown:carrier.fctrVerizon` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr2` -1.194 0.232324
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr2` 0.378 0.705253
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr2` -0.970 0.331914
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr2` 1.113 0.265546
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr2` -2.379 0.017344 *
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr2` -0.670 0.502603
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr2` 0.689 0.490854
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr2` 1.438 0.150533
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr2` 0.160 0.873188
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr2` 0.000 1.000000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr2` -0.255 0.798457
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr3` 3.326 0.000882 ***
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr3` 2.483 0.013031 *
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr3` -0.676 0.499128
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr3` 0.724 0.469197
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr3` -0.122 0.902747
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr3` 1.043 0.296810
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr3` 2.157 0.030980 *
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr3` 0.903 0.366595
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr3` 0.000 1.000000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr3` 1.586 0.112666
## `prdl.descr.my.fctrUnknown#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctrUnknown#1:.clusterid.fctr4` 0.370 0.711073
## `prdl.descr.my.fctriPad1#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad1#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad2#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad2#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad3#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad3#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad4#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPad4#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadAir#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadAir#1:.clusterid.fctr4` 1.136 0.255825
## `prdl.descr.my.fctriPadAir2#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadAir2#1:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini#1:.clusterid.fctr4` -0.191 0.848498
## `prdl.descr.my.fctriPadmini2#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini2#1:.clusterid.fctr4` -0.445 0.656612
## `prdl.descr.my.fctriPadmini3#0:.clusterid.fctr4` 0.000 1.000000
## `prdl.descr.my.fctriPadmini3#1:.clusterid.fctr4` 0.000 1.000000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1053.45 on 1014 degrees of freedom
## Residual deviance: 757.83 on 877 degrees of freedom
## AIC: 1033.8
##
## Number of Fisher Scoring iterations: 18
## sold.fctr sold.fctr.predict.RFE.X.Train.bayesglm.N
## 1 N 663
## 2 Y 71
## sold.fctr.predict.RFE.X.Train.bayesglm.Y
## 1 135
## 2 146
## Prediction
## Reference N Y
## N 663 135
## Y 71 146
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 7.970443e-01 4.548076e-01 7.709628e-01 8.213900e-01 7.862069e-01
## AccuracyPValue McnemarPValue
## 2.114806e-01 1.136544e-05
## id
## 1 RFE.X.Train.bayesglm
## feats
## 1 spdiff.cut.fctr,sprice.log10,sprice.d20nexp,sprice.root2,D.terms.post.stop.n.log,condition.fctr,D.wrds.unq.n.log,D.terms.post.stem.n.log,prdl.descr.my.fctr,D.ratio.wrds.stop.n.wrds.n,D.chrs.n.log,D.chrs.uppr.n.log,D.ratio.weight.sum.wrds.n,startprice.dcm1.is9,D.weight.post.stop.sum,D.weight.sum,D.wrds.n.log,D.weight.post.stem.sum,cellular.fctr,storage.fctr,D.weight.sum.stem.stop.Ratio,startprice.dcm2.is9,color.fctr,D.wrds.stop.n.log,D.chrs.pnct13.n.log,D.chrs.pnct11.n.log,startprice.dgt1.is9,prdl.descr.my.fctr:.clusterid.fctr,cellular.fctr:carrier.fctr
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 6.541 0.548
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.6621796 0.9649123 0.359447 0.8477068
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.5863454 0.7783198
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.7709628 0.82139 0.2080619
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01586827 0.04260476
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.3
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id =
## tail(glb_models_df$id, : Using default probability threshold: 0.3
## [1] "***************"
## [1] "Outliers in obsTrnDF not deleted yet"
## [1] "***************"
## [1] "fitting model: Final.Ensemble.glm"
## [1] " indep_vars: sold.fctr.predict.RFE.X.Train.avNNet.prob,sold.fctr.predict.RFE.X.Train.glm.prob,sold.fctr.predict.RFE.X.Train.glmnet.prob,sold.fctr.predict.RFE.X.Train.nnet.prob,sold.fctr.predict.RFE.X.Train.rpart.prob,sold.fctr.predict.RFE.X.Train.earth.prob,sold.fctr.predict.RFE.X.Train.bayesglm.prob"
## + Fold1.Rep1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep1: parameter=none
## + Fold2.Rep1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep1: parameter=none
## + Fold3.Rep1: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep1: parameter=none
## + Fold1.Rep2: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep2: parameter=none
## + Fold2.Rep2: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep2: parameter=none
## + Fold3.Rep2: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep2: parameter=none
## + Fold1.Rep3: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold1.Rep3: parameter=none
## + Fold2.Rep3: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold2.Rep3: parameter=none
## + Fold3.Rep3: parameter=none
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## - Fold3.Rep3: parameter=none
## Aggregating results
## Fitting final model on full training set
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6432 -0.3726 -0.2977 -0.1447 2.7839
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value
## (Intercept) -3.0004 0.3048 -9.844
## sold.fctr.predict.RFE.X.Train.avNNet.prob 12.2798 1.3245 9.271
## sold.fctr.predict.RFE.X.Train.bayesglm.prob -6.7433 2.7057 -2.492
## sold.fctr.predict.RFE.X.Train.earth.prob 3.5396 1.0444 3.389
## sold.fctr.predict.RFE.X.Train.glm.prob 5.5429 2.0403 2.717
## sold.fctr.predict.RFE.X.Train.glmnet.prob -13.1918 2.4746 -5.331
## sold.fctr.predict.RFE.X.Train.nnet.prob 2.9732 0.7013 4.239
## sold.fctr.predict.RFE.X.Train.rpart.prob NA NA NA
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## sold.fctr.predict.RFE.X.Train.avNNet.prob < 2e-16 ***
## sold.fctr.predict.RFE.X.Train.bayesglm.prob 0.012693 *
## sold.fctr.predict.RFE.X.Train.earth.prob 0.000702 ***
## sold.fctr.predict.RFE.X.Train.glm.prob 0.006593 **
## sold.fctr.predict.RFE.X.Train.glmnet.prob 9.77e-08 ***
## sold.fctr.predict.RFE.X.Train.nnet.prob 2.24e-05 ***
## sold.fctr.predict.RFE.X.Train.rpart.prob NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1049.40 on 1011 degrees of freedom
## Residual deviance: 503.91 on 1005 degrees of freedom
## AIC: 517.91
##
## Number of Fisher Scoring iterations: 6
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## sold.fctr sold.fctr.predict.Final.Ensemble.glm.N
## 1 N 753
## 2 Y 54
## sold.fctr.predict.Final.Ensemble.glm.Y
## 1 43
## 2 162
## Prediction
## Reference N Y
## N 753 43
## Y 54 162
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.041502e-01 7.091368e-01 8.843248e-01 9.215851e-01 7.865613e-01
## AccuracyPValue McnemarPValue
## 1.592977e-23 3.099408e-01
## id
## 1 Final.Ensemble.glm
## feats
## 1 sold.fctr.predict.RFE.X.Train.avNNet.prob,sold.fctr.predict.RFE.X.Train.glm.prob,sold.fctr.predict.RFE.X.Train.glmnet.prob,sold.fctr.predict.RFE.X.Train.nnet.prob,sold.fctr.predict.RFE.X.Train.rpart.prob,sold.fctr.predict.RFE.X.Train.earth.prob,sold.fctr.predict.RFE.X.Train.bayesglm.prob
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 1 1.248 0.013
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8212009 0.9711055 0.6712963 0.9325854
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.3 0.7695962 0.9015172
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.8843248 0.9215851 0.681952
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.007922324 0.03208666
## [1] "fitting model: Final.Ensemble.glmnet"
## [1] " indep_vars: sold.fctr.predict.RFE.X.Train.avNNet.prob,sold.fctr.predict.RFE.X.Train.glm.prob,sold.fctr.predict.RFE.X.Train.glmnet.prob,sold.fctr.predict.RFE.X.Train.nnet.prob,sold.fctr.predict.RFE.X.Train.rpart.prob,sold.fctr.predict.RFE.X.Train.earth.prob,sold.fctr.predict.RFE.X.Train.bayesglm.prob"
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.00587 on full training set
## Warning in myfit_mdl(mdl_specs_lst = myinit_mdl_specs_lst(mdl_specs_lst =
## list(id.prefix = ifelse(grepl("Ensemble", : model's bestTune found at an
## extreme of tuneGrid for parameter: alpha
## Length Class Mode
## a0 81 -none- numeric
## beta 567 dgCMatrix S4
## df 81 -none- numeric
## dim 2 -none- numeric
## lambda 81 -none- numeric
## dev.ratio 81 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## classnames 2 -none- character
## call 5 -none- call
## nobs 1 -none- numeric
## lambdaOpt 1 -none- numeric
## xNames 7 -none- character
## problemType 1 -none- character
## tuneValue 2 data.frame list
## obsLevels 2 -none- character
## [1] "min lambda > lambdaOpt:"
## (Intercept)
## -3.1690269
## sold.fctr.predict.RFE.X.Train.avNNet.prob
## 10.1197559
## sold.fctr.predict.RFE.X.Train.earth.prob
## 1.8033363
## sold.fctr.predict.RFE.X.Train.glm.prob
## 0.6229516
## sold.fctr.predict.RFE.X.Train.glmnet.prob
## -9.0987052
## sold.fctr.predict.RFE.X.Train.nnet.prob
## 2.2068229
## [1] "max lambda < lambdaOpt:"
## (Intercept)
## -3.1440529
## sold.fctr.predict.RFE.X.Train.avNNet.prob
## 10.2417424
## sold.fctr.predict.RFE.X.Train.earth.prob
## 1.9224161
## sold.fctr.predict.RFE.X.Train.glm.prob
## 0.6403199
## sold.fctr.predict.RFE.X.Train.glmnet.prob
## -9.5423929
## sold.fctr.predict.RFE.X.Train.nnet.prob
## 2.2431426
## sold.fctr sold.fctr.predict.Final.Ensemble.glmnet.N
## 1 N 766
## 2 Y 62
## sold.fctr.predict.Final.Ensemble.glmnet.Y
## 1 30
## 2 154
## Prediction
## Reference N Y
## N 766 30
## Y 62 154
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 9.090909e-01 7.138009e-01 8.896738e-01 9.260864e-01 7.865613e-01
## AccuracyPValue McnemarPValue
## 1.240076e-25 1.229385e-03
## id
## 1 Final.Ensemble.glmnet
## feats
## 1 sold.fctr.predict.RFE.X.Train.avNNet.prob,sold.fctr.predict.RFE.X.Train.glm.prob,sold.fctr.predict.RFE.X.Train.glmnet.prob,sold.fctr.predict.RFE.X.Train.nnet.prob,sold.fctr.predict.RFE.X.Train.rpart.prob,sold.fctr.predict.RFE.X.Train.earth.prob,sold.fctr.predict.RFE.X.Train.bayesglm.prob
## max.nTuningRuns min.elapsedtime.everything min.elapsedtime.final
## 1 25 2.397 0.038
## max.AUCpROC.fit max.Sens.fit max.Spec.fit max.AUCROCR.fit
## 1 0.8195142 0.9723618 0.6666667 0.9312157
## opt.prob.threshold.fit max.f.score.fit max.Accuracy.fit
## 1 0.4 0.77 0.9044836
## max.AccuracyLower.fit max.AccuracyUpper.fit max.Kappa.fit
## 1 0.8896738 0.9260864 0.6902241
## max.AccuracySD.fit max.KappaSD.fit
## 1 0.01369307 0.04506829
# Remove `ret_lst` from the workspace, then register the next
# "fit.data.training" entry in the chunk log without bumping the major step
# (major.inc = FALSE).
rm(list = "ret_lst")
glb_chunks_df <- myadd_chunk(glb_chunks_df, "fit.data.training",
                             major.inc = FALSE)
## label step_major step_minor label_minor bgn end
## 14 fit.data.training 7 0 0 659.988 918.427
## 15 fit.data.training 7 1 1 918.427 NA
## elapsed
## 14 258.439
## 15 NA
#stop(here"); glb_to_sav()
# Probability cut-off passed to glb_get_predictions() as `prob_threshold_def`:
# the selected model's "opt.prob.threshold.OOB" for binomial classification,
# NULL for every other problem type.
prob_threshold <-
    if (glb_is_classification && glb_is_binomial) {
        glb_models_df[glb_models_df$id == glb_sel_mdl_id, "opt.prob.threshold.OOB"]
    } else {
        NULL
    }

# When the final model is an ensemble, its inputs are the prediction columns of
# the component models, so those predictions are generated first for both the
# training and the new observations.
if (grepl("Ensemble", glb_fin_mdl_id)) {
    # Get predictions for each model in ensemble; Outliers that have been moved to OOB might not have been predicted yet
    mdlEnsembleComps <- unlist(str_split(subset(glb_models_df,
                                                id == glb_fin_mdl_id)$feats, ","))

    # Feature names look like "<glb_rsp_var_out><model id>[.prob]"; strip the
    # optional ".prob" suffix and the response-variable prefix to get model ids.
    if (glb_is_classification && glb_is_binomial) {
        mdlEnsembleComps <- gsub("\\.prob$", "", mdlEnsembleComps)
    }
    mdlEnsembleComps <- gsub(paste0("^",
                                    gsub(".", "\\.", glb_rsp_var_out, fixed = TRUE)),
                             "", mdlEnsembleComps)

    for (mdl_id in mdlEnsembleComps) {
        glb_trnobs_df <- glb_get_predictions(df=glb_trnobs_df, mdl_id=mdl_id,
                                             rsp_var_out=glb_rsp_var_out,
                                             prob_threshold_def=prob_threshold)
        glb_newobs_df <- glb_get_predictions(df=glb_newobs_df, mdl_id=mdl_id,
                                             rsp_var_out=glb_rsp_var_out,
                                             prob_threshold_def=prob_threshold)
    }
}
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(df = glb_newobs_df, mdl_id = mdl_id,
## rsp_var_out = glb_rsp_var_out, : Using default probability threshold: 0.4
# Add the final model's prediction columns to the training observations, using
# `prob_threshold` (computed above from the selected model) as the default cut-off.
glb_trnobs_df <- glb_get_predictions(df=glb_trnobs_df, mdl_id=glb_fin_mdl_id,
rsp_var_out=glb_rsp_var_out,
prob_threshold_def=prob_threshold)
## Warning in glb_get_predictions(df = glb_trnobs_df, mdl_id =
## glb_fin_mdl_id, : Using default probability threshold: 0.4
# Refresh the feature-importance table from the final model. In the printed
# result the `importance` column holds the final model's values, and features
# known only to earlier models are NA.
glb_featsimp_df <- myget_feats_importance(mdl=glb_fin_mdl,
featsimp_df=glb_featsimp_df)
# Keep a copy of the importances under a model-specific column name
# ("<glb_fin_mdl_id>.importance").
glb_featsimp_df[, paste0(glb_fin_mdl_id, ".importance")] <- glb_featsimp_df$importance
print(glb_featsimp_df)
## Ensemble.glmnet.importance
## sold.fctr.predict.RFE.X.Train.avNNet.prob NA
## sold.fctr.predict.RFE.X.Train.nnet.prob NA
## sold.fctr.predict.RFE.X.Train.earth.prob NA
## sold.fctr.predict.RFE.X.Train.glm.prob NA
## sold.fctr.predict.RFE.X.Train.bayesglm.prob NA
## sold.fctr.predict.RFE.X.Train.rpart.prob NA
## sold.fctr.predict.RFE.X.Train.glmnet.prob NA
## sold.fctr.predict.RFE.X.avNNet.prob 100.00000
## sold.fctr.predict.RFE.X.bayesglm.prob 21.01501
## sold.fctr.predict.RFE.X.earth.prob 32.26855
## sold.fctr.predict.RFE.X.gbm.prob 0.00000
## sold.fctr.predict.RFE.X.glm.prob 60.59569
## sold.fctr.predict.RFE.X.glmnet.prob 56.02422
## sold.fctr.predict.RFE.X.nnet.prob 46.25126
## sold.fctr.predict.RFE.X.rpart.prob 40.27596
## importance
## sold.fctr.predict.RFE.X.Train.avNNet.prob 100.00000
## sold.fctr.predict.RFE.X.Train.nnet.prob 58.86597
## sold.fctr.predict.RFE.X.Train.earth.prob 56.79193
## sold.fctr.predict.RFE.X.Train.glm.prob 50.63200
## sold.fctr.predict.RFE.X.Train.bayesglm.prob 47.39083
## sold.fctr.predict.RFE.X.Train.rpart.prob 47.39083
## sold.fctr.predict.RFE.X.Train.glmnet.prob 0.00000
## sold.fctr.predict.RFE.X.avNNet.prob NA
## sold.fctr.predict.RFE.X.bayesglm.prob NA
## sold.fctr.predict.RFE.X.earth.prob NA
## sold.fctr.predict.RFE.X.gbm.prob NA
## sold.fctr.predict.RFE.X.glm.prob NA
## sold.fctr.predict.RFE.X.glmnet.prob NA
## sold.fctr.predict.RFE.X.nnet.prob NA
## sold.fctr.predict.RFE.X.rpart.prob NA
## Final.Ensemble.glmnet.importance
## sold.fctr.predict.RFE.X.Train.avNNet.prob 100.00000
## sold.fctr.predict.RFE.X.Train.nnet.prob 58.86597
## sold.fctr.predict.RFE.X.Train.earth.prob 56.79193
## sold.fctr.predict.RFE.X.Train.glm.prob 50.63200
## sold.fctr.predict.RFE.X.Train.bayesglm.prob 47.39083
## sold.fctr.predict.RFE.X.Train.rpart.prob 47.39083
## sold.fctr.predict.RFE.X.Train.glmnet.prob 0.00000
## sold.fctr.predict.RFE.X.avNNet.prob NA
## sold.fctr.predict.RFE.X.bayesglm.prob NA
## sold.fctr.predict.RFE.X.earth.prob NA
## sold.fctr.predict.RFE.X.gbm.prob NA
## sold.fctr.predict.RFE.X.glm.prob NA
## sold.fctr.predict.RFE.X.glmnet.prob NA
## sold.fctr.predict.RFE.X.nnet.prob NA
## sold.fctr.predict.RFE.X.rpart.prob NA
# Diagnostic plots of the final model on the training observations. For
# binomial classification the selected model's OOB-optimal probability
# threshold is passed along; otherwise the helper runs without a threshold.
if (glb_is_classification && glb_is_binomial) {
    glb_analytics_diag_plots(obs_df = glb_trnobs_df, mdl_id = glb_fin_mdl_id,
                             prob_threshold = glb_models_df[glb_models_df$id == glb_sel_mdl_id,
                                                            "opt.prob.threshold.OOB"])
} else {
    glb_analytics_diag_plots(obs_df = glb_trnobs_df, mdl_id = glb_fin_mdl_id)
}
## Warning in glb_analytics_diag_plots(obs_df = glb_trnobs_df, mdl_id =
## glb_fin_mdl_id, : No features in selected model are statistically important
# Gather the ids of every feature that has a non-NA value in at least one
# "*.importance" column of glb_feats_df.
dsp_feats_vctr <- c(NULL)
for (var in grep(".importance", names(glb_feats_df), fixed = TRUE, value = TRUE)) {
    dsp_feats_vctr <- union(dsp_feats_vctr,
                            glb_feats_df[!is.na(glb_feats_df[, var]), "id"])
}

# print(glb_trnobs_df[glb_trnobs_df$UniqueID %in% FN_OOB_ids,
#                     grep(glb_rsp_var, names(glb_trnobs_df), value=TRUE)])

# List the columns present in the training observations but not yet in the
# combined observations (the prediction columns added during this step).
print(setdiff(names(glb_trnobs_df), names(glb_allobs_df)))
## [1] "sold.fctr.predict.RFE.X.Train.avNNet.prob"
## [2] "sold.fctr.predict.RFE.X.Train.avNNet"
## [3] "sold.fctr.predict.RFE.X.Train.avNNet.err"
## [4] "sold.fctr.predict.RFE.X.Train.avNNet.err.abs"
## [5] "sold.fctr.predict.RFE.X.Train.avNNet.accurate"
## [6] "sold.fctr.predict.RFE.X.Train.glm.prob"
## [7] "sold.fctr.predict.RFE.X.Train.glm"
## [8] "sold.fctr.predict.RFE.X.Train.glm.err"
## [9] "sold.fctr.predict.RFE.X.Train.glm.err.abs"
## [10] "sold.fctr.predict.RFE.X.Train.glm.accurate"
## [11] "sold.fctr.predict.RFE.X.Train.glmnet.prob"
## [12] "sold.fctr.predict.RFE.X.Train.glmnet"
## [13] "sold.fctr.predict.RFE.X.Train.glmnet.err"
## [14] "sold.fctr.predict.RFE.X.Train.glmnet.err.abs"
## [15] "sold.fctr.predict.RFE.X.Train.glmnet.accurate"
## [16] "sold.fctr.predict.RFE.X.Train.nnet.prob"
## [17] "sold.fctr.predict.RFE.X.Train.nnet"
## [18] "sold.fctr.predict.RFE.X.Train.nnet.err"
## [19] "sold.fctr.predict.RFE.X.Train.nnet.err.abs"
## [20] "sold.fctr.predict.RFE.X.Train.nnet.accurate"
## [21] "sold.fctr.predict.RFE.X.Train.rpart.prob"
## [22] "sold.fctr.predict.RFE.X.Train.rpart"
## [23] "sold.fctr.predict.RFE.X.Train.rpart.err"
## [24] "sold.fctr.predict.RFE.X.Train.rpart.err.abs"
## [25] "sold.fctr.predict.RFE.X.Train.rpart.accurate"
## [26] "sold.fctr.predict.RFE.X.Train.earth.prob"
## [27] "sold.fctr.predict.RFE.X.Train.earth"
## [28] "sold.fctr.predict.RFE.X.Train.earth.err"
## [29] "sold.fctr.predict.RFE.X.Train.earth.err.abs"
## [30] "sold.fctr.predict.RFE.X.Train.earth.accurate"
## [31] "sold.fctr.predict.RFE.X.Train.bayesglm.prob"
## [32] "sold.fctr.predict.RFE.X.Train.bayesglm"
## [33] "sold.fctr.predict.RFE.X.Train.bayesglm.err"
## [34] "sold.fctr.predict.RFE.X.Train.bayesglm.err.abs"
## [35] "sold.fctr.predict.RFE.X.Train.bayesglm.accurate"
## [36] "sold.fctr.predict.Final.Ensemble.glmnet.prob"
## [37] "sold.fctr.predict.Final.Ensemble.glmnet"
## [38] "sold.fctr.predict.Final.Ensemble.glmnet.err"
## [39] "sold.fctr.predict.Final.Ensemble.glmnet.err.abs"
## [40] "sold.fctr.predict.Final.Ensemble.glmnet.accurate"
# Copy the columns that exist only in glb_trnobs_df (the prediction columns
# listed above) onto the "Train" rows of glb_allobs_df.
#
# Rows are aligned through glb_id_var rather than by position, so the copy
# stays correct even if the two data frames are ordered differently; when the
# order is the same the result equals a positional copy. which() drops NA
# entries of `.src`, which a logical row index would reject in an assignment.
# NOTE(review): assumes glb_id_var values are unique within glb_trnobs_df.
trn_only_cols <- setdiff(names(glb_trnobs_df), names(glb_allobs_df))
if (length(trn_only_cols) > 0) {
    trn_rows <- which(glb_allobs_df$.src == "Train")
    trn_pos  <- match(glb_allobs_df[trn_rows, glb_id_var],
                      glb_trnobs_df[, glb_id_var])
    for (col in trn_only_cols)
        glb_allobs_df[trn_rows, col] <- glb_trnobs_df[trn_pos, col]
}
# Columns of the fit observations still missing from glb_allobs_df (expected: none).
print(setdiff(names(glb_fitobs_df), names(glb_allobs_df)))
## character(0)
# Columns of the OOB observations that are missing from glb_allobs_df
# (character(0) in this run, so the copy loop that follows has nothing to do).
print(setdiff(names(glb_OOBobs_df), names(glb_allobs_df)))
## character(0)
# Copy any columns that exist only in glb_OOBobs_df onto the "OOB" rows of
# glb_allobs_df.
#
# Rows are aligned through glb_id_var rather than by position, and which()
# drops rows whose `.lcn` is NA, which a logical row index would reject in an
# assignment.
# NOTE(review): assumes glb_id_var values are unique within glb_OOBobs_df.
oob_only_cols <- setdiff(names(glb_OOBobs_df), names(glb_allobs_df))
if (length(oob_only_cols) > 0) {
    oob_rows <- which(glb_allobs_df$.lcn == "OOB")
    oob_pos  <- match(glb_allobs_df[oob_rows, glb_id_var],
                      glb_OOBobs_df[, glb_id_var])
    for (col in oob_only_cols)
        glb_allobs_df[oob_rows, col] <- glb_OOBobs_df[oob_pos, col]
}
# Columns of the new observations still missing from glb_allobs_df (expected: none).
print(setdiff(names(glb_newobs_df), names(glb_allobs_df)))
## character(0)
# Optionally persist the main analysis objects to "<glb_out_pfx>dsk.RData".
# The per-subset observation frames are deliberately left out (commented line);
# only the combined glb_allobs_df is saved.
if (glb_save_envir)
save(glb_feats_df, glb_allobs_df,
#glb_trnobs_df, glb_fitobs_df, glb_OOBobs_df, glb_newobs_df,
glb_models_df, dsp_models_df, glb_models_lst, glb_model_type,
glb_sel_mdl, glb_sel_mdl_id,
glb_fin_mdl, glb_fin_mdl_id,
file=paste0(glb_out_pfx, "dsk.RData"))
# Replay the workflow Petri net. The replay.trans argument also updates
# glb_analytics_avl_objs in place (assignment inside the call), appending
# "data.training.all.prediction" and "model.final" to the available objects.
replay.petrisim(pn=glb_analytics_pn,
replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
"data.training.all.prediction","model.final")), flip_coord=TRUE)
## time trans "bgn " "fit.data.training.all " "predict.data.new " "end "
## 0.0000 multiple enabled transitions: data.training.all data.new model.selected firing: data.training.all
## 1.0000 1 2 1 0 0
## 1.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction firing: data.new
## 2.0000 2 1 1 1 0
## 2.0000 multiple enabled transitions: data.training.all data.new model.selected model.final data.training.all.prediction data.new.prediction firing: model.selected
## 3.0000 3 0 2 1 0
## 3.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: data.training.all.prediction
## 4.0000 5 0 1 1 1
## 4.0000 multiple enabled transitions: model.final data.training.all.prediction data.new.prediction firing: model.final
## 5.0000 4 0 0 2 1
# Log the start of the "predict.data.new" step. With major.inc=TRUE the printed
# chunk table moves step_major from 7 to 8 and resets step_minor to 0.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "predict.data.new", major.inc=TRUE)
## label step_major step_minor label_minor bgn end
## 15 fit.data.training 7 1 1 918.427 927.165
## 16 predict.data.new 8 0 0 927.165 NA
## elapsed
## 15 8.738
## 16 NA
8.0: predict data new
# Compute final model predictions
#glb_to_sav(); all.equal(sav_allobs_df, glb_allobs_df); all.equal(sav_trnobs_df, glb_trnobs_df); all.equal(sav_newobs_df, glb_newobs_df)
# Default probability cut-off for the final predictions: the selected model's
# "opt.prob.threshold.OOB" for binomial classification, NULL otherwise.
prob_threshold_def <-
    if (glb_is_classification && glb_is_binomial) {
        glb_models_df[glb_models_df$id == glb_sel_mdl_id, "opt.prob.threshold.OOB"]
    } else {
        NULL
    }

# Score the training and the new observations with the final model and write
# each result back to the data frame it came from. The "all" alternatives are
# kept from the original, although "all" is not in the list being iterated.
for (obsSet in c("trn", "new")) {
    obs_df <- switch(obsSet, all = glb_allobs_df, trn = glb_trnobs_df, new = glb_newobs_df)
    obs_df <- glb_get_predictions(obs_df, mdl_id = glb_fin_mdl_id,
                                  rsp_var_out = glb_rsp_var_out, prob_threshold_def = prob_threshold_def)
    switch(obsSet,
           all = { glb_allobs_df <- obs_df },
           trn = { glb_trnobs_df <- obs_df },
           new = { glb_newobs_df <- obs_df })
}
## Warning in glb_get_predictions(obs_df, mdl_id = glb_fin_mdl_id, rsp_var_out
## = glb_rsp_var_out, : Using default probability threshold: 0.4
## Warning in glb_get_predictions(obs_df, mdl_id = glb_fin_mdl_id, rsp_var_out
## = glb_rsp_var_out, : Using default probability threshold: 0.4
# Drop the loop's scratch data frame.
rm(obs_df)
# Rebuild the combined observations from the freshly scored training and new
# observations, ordered by the id column (glb_id_var).
# NOTE(review): myrbind_df presumably row-binds frames with differing columns - confirm.
glb_allobs_df <- orderBy(reformulate(glb_id_var), myrbind_df(glb_trnobs_df, glb_newobs_df))
# Diagnostic plots of the final model on the new observations.
glb_analytics_diag_plots(obs_df = glb_newobs_df, mdl_id = glb_fin_mdl_id,
prob_threshold = prob_threshold_def)
## Warning in glb_analytics_diag_plots(obs_df = glb_newobs_df, mdl_id =
## glb_fin_mdl_id, : No features in selected model are statistically important
# Choose the observations that go into the output file: the new observations
# by default, or the set named by glb_out_obs ("all", "trn" or "new").
# An unrecognised name now fails here with an explicit message; previously
# switch() returned NULL silently and the failure surfaced later as an
# unrelated subsetting error.
if (is.null(glb_out_obs)) {
    obs_df <- glb_newobs_df
} else {
    obs_df <- switch(glb_out_obs,
                     all = glb_allobs_df,
                     trn = glb_trnobs_df,
                     new = glb_newobs_df,
                     stop("glb_out_obs must be NULL, 'all', 'trn' or 'new'; got: ",
                          glb_out_obs))
}
require(stringr)
# Build the output data frame: the id column plus one column per entry of
# glb_out_vars_lst. An entry is either the name of a column in obs_df, or a
# string of the form "%<d-% <R expression>" whose evaluated value is the
# column name. The original repeated this construction in both branches of
# the problem-type test; it is identical for every problem type, so it is
# done once here.
# submit_df <- glb_newobs_df[, c(glb_id_var,
#                                paste0(glb_rsp_var_out, glb_fin_mdl_id, ".prob"))]
# names(submit_df)[2] <- "Probability1"
obsout_df <- obs_df[, glb_id_var, FALSE]
for (clmn in names(glb_out_vars_lst)) {
    if (!grepl("^%<d-%", glb_out_vars_lst[[clmn]])) {
        obsout_df[, clmn] <- obs_df[, glb_out_vars_lst[[clmn]]]
    } else {
        feat <- str_trim(unlist(strsplit(glb_out_vars_lst[[clmn]], "%<d-%"))[2])
        # NOTE(review): eval(parse()) runs arbitrary code taken from
        # glb_out_vars_lst; acceptable only while that list is analyst-authored.
        obsout_df[, clmn] <- obs_df[, eval(parse(text = feat))]
    }
}

if (glb_is_classification && glb_is_binomial) {
    # Observation ids whose prediction was meant to be forced to class "0" or
    # "1". The probability overrides themselves are disabled (see the
    # commented code below); what still runs is a sanity check that every
    # listed id present in glb_allobs_df has a non-NA .grpid.
    glb_force_prediction_lst <- list()
    glb_force_prediction_lst[["0"]] <- c(11885, 11907, 11932, 11943,
                                         12050, 12115, 12171,
                                         12253, 12285, 12367, 12388, 12399,
                                         12585)
    glb_force_prediction_lst[["1"]] <- c(11871, 11875, 11886,
        11913, 11931, 11937, 11967, 11982, 11990, 11991, 11994, 11999,
        12000, 12002, 12004, 12018, 12021, 12065, 12072,
        12111, 12114, 12126, 12134, 12152, 12172,
        12213, 12214, 12233, 12265, 12278, 12299,
        12446, 12491,
        12505, 12576, 12608, 12630)

    for (forced_class in names(glb_force_prediction_lst)) {
        for (obs_id in glb_force_prediction_lst[[forced_class]]) {
            # which() keeps the lookup NA-safe; any() keeps the test scalar
            # even if an id should match more than one row.
            obs_rows <- which(glb_allobs_df[, glb_id_var] == obs_id)
            if (length(obs_rows) == 0)
                next
            if (any(is.na(glb_allobs_df[obs_rows, ".grpid"])))
                stop(".grpid is NA")
            # Disabled overrides:
            #   class "0":
            #     submit_df[submit_df[, glb_id_var] == obs_id, "Probability1"] <-
            #         max(0, submit_df[submit_df[, glb_id_var] == obs_id, "Probability1"] - 0.5)
            #   class "1":
            #     submit_df[submit_df[, glb_id_var] == obs_id, "Probability1"] <-
            #         min(0.9999, submit_df[submit_df[, glb_id_var] == obs_id, "Probability1"] + 0.5)
        }
    }

    rsp_var_out <- paste0(glb_rsp_var_out, glb_fin_mdl_id)
    # Placeholder loop over new observations predicted "Y" with startprice
    # above 675; the adjustment in its body is disabled. The id column is now
    # taken from glb_id_var instead of the hard-coded "UniqueID".
    # NOTE(review): 675 looks like the highest startprice of a sold item in the
    # training data (see the commented check further down) - confirm.
    for (obs_id in glb_newobs_df[!is.na(glb_newobs_df[, rsp_var_out]) &
                                 (glb_newobs_df[, rsp_var_out] == "Y") &
                                 (glb_newobs_df[ , "startprice"] > 675), glb_id_var]) {
        # submit_df[submit_df[, glb_id_var] == obs_id, "Probability1"] <-
        #     max(0, submit_df[submit_df[, glb_id_var] == obs_id, "Probability1"] - 0.5)
    }
}
if (glb_is_classification) {
# Cross-check the new-observation predictions against the duplicate groups.
# For every new observation that belongs to a group (.grpid not NA), the
# group's outcome counts from dupgrps_df are attached together with the output
# columns. A row is flagged when its output probability is above 0.5 although
# the group contains unsold items (sold.0 > 0), or below 0.5 although the
# group contains sold items (sold.1 > 0).
# NOTE(review): relies on glb_out_vars_lst defining a "Probability1" output
# column and on dupgrps_df carrying sold.0 / sold.1 - confirm for other data sets.
rsp_var_out <- paste0(glb_rsp_var_out, glb_fin_mdl_id)
tmp_newobs_df <- subset(glb_newobs_df[, c(glb_id_var, ".grpid", rsp_var_out)],
                        !is.na(.grpid))
tmp_newobs_df <- merge(tmp_newobs_df, dupgrps_df, by = ".grpid", all.x = TRUE)
tmp_newobs_df <- merge(tmp_newobs_df, obsout_df, by = glb_id_var, all.x = TRUE)
tmp_newobs_df$.err <-
    ((tmp_newobs_df$Probability1 > 0.5) & (tmp_newobs_df$sold.0 > 0) |
     (tmp_newobs_df$Probability1 < 0.5) & (tmp_newobs_df$sold.1 > 0))
# Order by the configured id column (was hard-coded as ~UniqueID), matching the
# way glb_allobs_df is ordered elsewhere in this file.
tmp_newobs_df <- orderBy(reformulate(glb_id_var), subset(tmp_newobs_df, .err == TRUE))
print(sprintf("Prediction errors in duplicates: %d", nrow(tmp_newobs_df)))
print(tmp_newobs_df)
# if (nrow(tmp_newobs_df) > 0)
#     stop("check Prediction errors in duplicates")
#print(dupobs_df[dupobs_df$.grpid == 26, ])
# New observations with the output probability attached; only the
# commented-out startprice check that follows reads it.
tmp_newobs_df <- cbind(glb_newobs_df, obsout_df[, "Probability1", FALSE])
# if (max(glb_newobs_df[!is.na(glb_newobs_df[, rsp_var_out]) &
# (tmp_newobs_df[, "Probability1"] >= 0.5), "startprice"]) >
# max(glb_allobs_df[!is.na(glb_allobs_df[, glb_rsp_var]) &
# (glb_allobs_df[, glb_rsp_var] == "Y"), "startprice"]))
# stop("startprice for some +ve predictions > 675")
# Check predictions that are outside of data ranges
#stop(here")
require(stringr)
# Features whose ranges are checked: not near-zero-variance, not excluded as a
# feature, and without ".fctr" in the name. Only the `id` column is kept.
tmp_feats_df <- subset(glb_feats_df,
!nzv &
(exclude.as.feat != 1) &
!grepl(".fctr", id, fixed=TRUE))[, "id", FALSE]
# Overall min / max of each such feature across all observations, reshaped to
# one row per feature with `max` and `min` columns. The summary columns are
# split into a feature id (everything but the last 4 characters) and a stat
# name (the last 3 characters).
# NOTE(review): this presumes summarise_each names its outputs "<feature>_min"
# and "<feature>_max" - confirm for the installed dplyr version.
ranges_all_df <- glb_allobs_df[, tmp_feats_df$id] %>%
dplyr::summarise_each(funs(min(., na.rm=TRUE),
max(., na.rm=TRUE))) %>%
tidyr::gather() %>%
dplyr::mutate(id=str_sub(key, 1, -5),
stat=str_sub(key, -3)) %>%
dplyr::select(-key) %>%
tidyr::spread(stat, value)
# sav_ranges_trn_df <- ranges_trn_df; all.equal(sav_ranges_trn_df, ranges_trn_df)
# sav_ranges_new_df <- ranges_new_df; all.equal(sav_ranges_new_df, ranges_new_df)
# Per-class feature ranges.
#
# Args:
#   obs_df:    data frame holding the `feats` columns and the `class_var` column.
#   feats:     names of the feature columns to summarise.
#   class_var: name of the grouping column (actual or predicted class).
#
# Returns:
#   A data frame with one row per feature (`id`) and one column per statistic
#   and class level, named "<min|max>.<class_var>.<class level>".
#
# All verbs are namespace-qualified, in line with the ranges_all_df pipeline in
# this file, so the result does not depend on which package currently masks
# `mutate`, `spread`, etc. on the search path.
# NOTE(review): splitting `key` with fixed offsets presumes summarise_each
# names its outputs "<feature>_min" / "<feature>_max" - confirm.
get_ranges_df <- function(obs_df, feats, class_var) {
    require(tidyr)
    ranges_df <- obs_df[, c(class_var, feats)] %>%
        dplyr::group_by_(class_var) %>%
        dplyr::summarise_each(funs(min(., na.rm=TRUE),
                                   max(., na.rm=TRUE))) %>%
        # Gather every summary column; column 1 is the class column and stays.
        tidyr::gather(key, value, -1) %>%
        dplyr::mutate(id=stringr::str_sub(key, 1, -5),
                      stat.vname=paste0(stringr::str_sub(key, -3), ".", class_var)) %>%
        # "<stat>.<class_var>" + class level -> "<stat>.<class_var>.<level>"
        tidyr::unite_("stat.class", c("stat.vname", class_var), sep=".") %>%
        dplyr::select(-key) %>%
        tidyr::spread(stat.class, value)
    return(ranges_df)
}
# Names of the predicted-class columns: the selected model's predictions (used
# for the OOB observations) and the final model's predictions (used for the
# new observations).
rsp_var_out_OOB <- paste0(glb_rsp_var_out, glb_sel_mdl_id)
rsp_var_out_new <- paste0(glb_rsp_var_out, glb_fin_mdl_id)

# Reference ranges, grouped by the actual class.
ranges_trn_df <- get_ranges_df(glb_trnobs_df, tmp_feats_df$id, glb_rsp_var)
ranges_fit_df <- get_ranges_df(glb_fitobs_df, tmp_feats_df$id, glb_rsp_var)

# Ranges of the scored observations, grouped by the predicted class.
ranges_OOB_df <- get_ranges_df(glb_OOBobs_df, tmp_feats_df$id, rsp_var_out_OOB)
ranges_new_df <- get_ranges_df(glb_newobs_df, tmp_feats_df$id, rsp_var_out_new)
# Report observations whose feature values fall outside the range seen for the
# same class in the reference data: fit observations are the reference for
# OOB, training observations are the reference for new.
for (obsset in c("OOB", "new")) {
    if (obsset == "OOB") {
        ranges_ref_df   <- ranges_fit_df
        obs_df          <- glb_OOBobs_df
        rsp_var_out_obs <- rsp_var_out_OOB
        sprintf_pfx     <- "OOBobs"
    } else {
        ranges_ref_df   <- ranges_trn_df
        obs_df          <- glb_newobs_df
        rsp_var_out_obs <- rsp_var_out_new
        sprintf_pfx     <- "newobs"
    }

    # Feature metadata joined with every range table; only features that have
    # an overall range are kept, and ".rnorm" is dropped.
    plt_feats_df <- merge(glb_feats_df, ranges_all_df, all=TRUE)
    plt_feats_df <- merge(plt_feats_df, ranges_ref_df, all=TRUE)
    plt_feats_df <- merge(plt_feats_df, ranges_OOB_df, all=TRUE)
    plt_feats_df <- merge(plt_feats_df, ranges_new_df, all=TRUE)
    plt_feats_df <- subset(plt_feats_df, !is.na(min) & (id != ".rnorm"))
    row.names(plt_feats_df) <- plt_feats_df$id

    range_outlier_ids <- c(NULL)
    for (clss in unique(obs_df[, rsp_var_out_obs])) {
        for (stat in c("min", "max")) {
            # Column with the scored set's range for this predicted class, and
            # the matching reference column for the actual class.
            obs_rng_col <- paste(stat, rsp_var_out_obs, clss, sep=".")
            ref_rng_col <- paste(stat, glb_rsp_var, clss, sep=".")
            # "Outside the range" means below the reference min or above the
            # reference max.
            is_beyond <- if (stat == "min") `<` else `>`

            dsp_feats <- plt_feats_df[
                which(is_beyond(plt_feats_df[, obs_rng_col],
                                plt_feats_df[, ref_rng_col])), "id"]
            if (length(dsp_feats) == 0) next

            # Ids of the observations responsible, across all offending features.
            ths_ids <- c(NULL)
            for (feat in dsp_feats) {
                ref_limit <- plt_feats_df[plt_feats_df$id == feat, ref_rng_col]
                ths_ids <- union(ths_ids,
                                 obs_df[(obs_df[, rsp_var_out_obs] == clss) &
                                        is_beyond(obs_df[, feat], ref_limit),
                                        glb_id_var])
            }

            tmp_obs_df <- obs_df[obs_df[, glb_id_var] %in% ths_ids,
                                 c(glb_id_var, rsp_var_out_obs, dsp_feats)]
            msg_fmt <- if (stat == "min") {
                "%s %s %s: min < min of Train range: %d"
            } else {
                "%s %s %s: max > max of Train range: %d"
            }
            print(sprintf(msg_fmt,
                          sprintf_pfx, rsp_var_out_obs, clss, nrow(tmp_obs_df)))
            myprint_df(tmp_obs_df)
            print(subset(plt_feats_df, id %in% dsp_feats))

            range_outlier_ids <- union(range_outlier_ids, ths_ids)
        }
    }
    print(sprintf("%s total range outliers: %d", sprintf_pfx, length(range_outlier_ids)))
}
}
## [1] "Prediction errors in duplicates: 2"
## UniqueID .grpid sold.fctr.predict.Final.Ensemble.glmnet sold.0 sold.1
## 33 12446 48 N 0 1
## 36 12576 41 N 0 1
## sold.NA .freq Probability1 .err
## 33 1 2 0.1309766 TRUE
## 36 1 2 0.1767807 TRUE
## [1] "OOBobs sold.fctr.predict.Ensemble.glmnet N: min < min of Train range: 7"
## UniqueID sold.fctr.predict.Ensemble.glmnet D.ratio.wrds.stop.n.wrds.n
## 866 11591 N 1.00000000
## 880 11610 N 0.33333333
## 1001 11846 N 0.28571429
## 710 11397 N 1.00000000
## 819 11543 N 0.04347826
## 378 10918 N 0.04347826
## 469 11062 N 0.11111111
## D.weight.sum.stem.stop.Ratio sprice.d20nexp sprice.log10 sprice.root2
## 866 1.0000000 8.611384e-01 1.095273 1.729162
## 880 0.7853552 1.504191e-03 4.867458 11.401316
## 1001 0.7944027 3.726653e-06 5.521461 15.811388
## 710 1.0000000 2.027639e-22 6.906755 31.606961
## 819 0.9801883 2.753645e-05 5.347108 14.491377
## 378 0.9754964 1.585406e-04 5.164729 13.228379
## 469 0.8031306 6.459233e-06 5.476464 15.459625
## id cor.y
## D.ratio.wrds.stop.n.wrds.n D.ratio.wrds.stop.n.wrds.n 0.032325360
## D.weight.sum.stem.stop.Ratio D.weight.sum.stem.stop.Ratio -0.003205858
## sprice.d20nexp sprice.d20nexp 0.050688215
## sprice.log10 sprice.log10 -0.150032840
## sprice.root2 sprice.root2 -0.158256112
## exclude.as.feat cor.y.abs
## D.ratio.wrds.stop.n.wrds.n FALSE 0.032325360
## D.weight.sum.stem.stop.Ratio FALSE 0.003205858
## sprice.d20nexp FALSE 0.050688215
## sprice.log10 FALSE 0.150032840
## sprice.root2 FALSE 0.158256112
## cor.high.X freqRatio
## D.ratio.wrds.stop.n.wrds.n D.terms.post.stop.n.log 16.14706
## D.weight.sum.stem.stop.Ratio <NA> 33.41176
## sprice.d20nexp <NA> 1.31250
## sprice.log10 sprice.root2 1.31250
## sprice.root2 <NA> 1.31250
## percentUnique zeroVar nzv is.cor.y.abs.low
## D.ratio.wrds.stop.n.wrds.n 6.502463 FALSE FALSE FALSE
## D.weight.sum.stem.stop.Ratio 34.975369 FALSE FALSE TRUE
## sprice.d20nexp 46.206897 FALSE FALSE FALSE
## sprice.log10 46.206897 FALSE FALSE FALSE
## sprice.root2 46.206897 FALSE FALSE FALSE
## interaction.feat shapiro.test.p.value
## D.ratio.wrds.stop.n.wrds.n <NA> 3.630249e-38
## D.weight.sum.stem.stop.Ratio <NA> 3.630602e-42
## sprice.d20nexp <NA> 5.382519e-55
## sprice.log10 <NA> 6.612030e-24
## sprice.root2 <NA> 7.924585e-02
## rsp_var_raw id_var rsp_var max
## D.ratio.wrds.stop.n.wrds.n FALSE NA NA 1.0000000
## D.weight.sum.stem.stop.Ratio FALSE NA NA 1.0000000
## sprice.d20nexp FALSE NA NA 0.8611384
## sprice.log10 FALSE NA NA 6.9077453
## sprice.root2 FALSE NA NA 31.6226185
## min max.sold.fctr.N max.sold.fctr.Y
## D.ratio.wrds.stop.n.wrds.n 4.347826e-02 1.0000000 1.0000000
## D.weight.sum.stem.stop.Ratio 7.853552e-01 1.0000000 1.0000000
## sprice.d20nexp 1.929714e-22 0.8191402 0.7909662
## sprice.log10 1.095273e+00 6.8553877 6.4216060
## sprice.root2 1.729162e+00 30.8055190 24.7989919
## min.sold.fctr.N min.sold.fctr.Y
## D.ratio.wrds.stop.n.wrds.n 5.000000e-02 4.761905e-02
## D.weight.sum.stem.stop.Ratio 8.077120e-01 8.083530e-01
## sprice.d20nexp 2.472641e-21 4.422439e-14
## sprice.log10 1.383791e+00 1.545433e+00
## sprice.root2 1.997498e+00 2.165641e+00
## max.sold.fctr.predict.Ensemble.glmnet.N
## D.ratio.wrds.stop.n.wrds.n 1.0000000
## D.weight.sum.stem.stop.Ratio 1.0000000
## sprice.d20nexp 0.8611384
## sprice.log10 6.9067548
## sprice.root2 31.6069613
## max.sold.fctr.predict.Ensemble.glmnet.Y
## D.ratio.wrds.stop.n.wrds.n 1.0000000
## D.weight.sum.stem.stop.Ratio 1.0000000
## sprice.d20nexp 0.3680634
## sprice.log10 6.3543700
## sprice.root2 23.9791576
## min.sold.fctr.predict.Ensemble.glmnet.N
## D.ratio.wrds.stop.n.wrds.n 4.347826e-02
## D.weight.sum.stem.stop.Ratio 7.853552e-01
## sprice.d20nexp 2.027639e-22
## sprice.log10 1.095273e+00
## sprice.root2 1.729162e+00
## min.sold.fctr.predict.Ensemble.glmnet.Y
## D.ratio.wrds.stop.n.wrds.n 4.761905e-02
## D.weight.sum.stem.stop.Ratio 8.309536e-01
## sprice.d20nexp 3.266131e-13
## sprice.log10 2.995232e+00
## sprice.root2 4.471018e+00
## max.sold.fctr.predict.Final.Ensemble.glmnet.N
## D.ratio.wrds.stop.n.wrds.n 1.0000000
## D.weight.sum.stem.stop.Ratio 1.0000000
## sprice.d20nexp 0.7538966
## sprice.log10 6.9077453
## sprice.root2 31.6226185
## max.sold.fctr.predict.Final.Ensemble.glmnet.Y
## D.ratio.wrds.stop.n.wrds.n 1.00000000
## D.weight.sum.stem.stop.Ratio 1.00000000
## sprice.d20nexp 0.08212605
## sprice.log10 6.32614947
## sprice.root2 23.64318084
## min.sold.fctr.predict.Final.Ensemble.glmnet.N
## D.ratio.wrds.stop.n.wrds.n 5.000000e-02
## D.weight.sum.stem.stop.Ratio 8.562469e-01
## sprice.d20nexp 1.929714e-22
## sprice.log10 1.731656e+00
## sprice.root2 2.376973e+00
## min.sold.fctr.predict.Final.Ensemble.glmnet.Y
## D.ratio.wrds.stop.n.wrds.n 5.263158e-02
## D.weight.sum.stem.stop.Ratio 8.306777e-01
## sprice.d20nexp 7.268909e-13
## sprice.log10 3.911823e+00
## sprice.root2 7.070361e+00
## [1] "OOBobs sold.fctr.predict.Ensemble.glmnet N: max > max of Train range: 7"
## UniqueID sold.fctr.predict.Ensemble.glmnet D.chrs.pnct11.n.log
## 866 11591 N 0.0000000
## 546 11172 N 0.0000000
## 640 11301 N 0.0000000
## 710 11397 N 0.0000000
## 819 11543 N 0.0000000
## 845 11570 N 1.9459101
## 378 10918 N 0.6931472
## D.chrs.pnct13.n.log D.chrs.uppr.n.log D.terms.post.stem.n.log
## 866 0.0000000 0.000000 0.000000
## 546 1.9459101 4.290459 2.772589
## 640 0.6931472 4.465908 2.564949
## 710 0.0000000 0.000000 0.000000
## 819 0.0000000 4.343805 2.944439
## 845 0.0000000 4.356709 2.397895
## 378 0.6931472 4.356709 3.044522
## D.terms.post.stop.n.log D.wrds.n.log D.wrds.unq.n.log sprice.d20nexp
## 866 0.000000 0.000000 0.000000 8.611384e-01
## 546 2.772589 2.995732 2.772589 5.533610e-04
## 640 2.564949 2.639057 2.564949 1.445703e-07
## 710 0.000000 0.000000 0.000000 2.027639e-22
## 819 2.944439 3.135494 2.944439 2.753645e-05
## 845 2.397895 2.708050 2.397895 3.726653e-06
## 378 3.044522 3.135494 3.044522 1.585406e-04
## sprice.log10 sprice.root2
## 866 1.095273 1.729162
## 546 5.010569 12.247040
## 640 5.752541 17.747958
## 710 6.906755 31.606961
## 819 5.347108 14.491377
## 845 5.521461 15.811388
## 378 5.164729 13.228379
## id cor.y
## D.chrs.pnct11.n.log D.chrs.pnct11.n.log -0.03440286
## D.chrs.pnct13.n.log D.chrs.pnct13.n.log -0.04148065
## D.chrs.uppr.n.log D.chrs.uppr.n.log -0.03219346
## D.terms.post.stem.n.log D.terms.post.stem.n.log -0.03316050
## D.terms.post.stop.n.log D.terms.post.stop.n.log -0.03375218
## D.wrds.n.log D.wrds.n.log -0.02925929
## D.wrds.unq.n.log D.wrds.unq.n.log -0.03316050
## sprice.d20nexp sprice.d20nexp 0.05068822
## sprice.log10 sprice.log10 -0.15003284
## sprice.root2 sprice.root2 -0.15825611
## exclude.as.feat cor.y.abs cor.high.X
## D.chrs.pnct11.n.log FALSE 0.03440286 <NA>
## D.chrs.pnct13.n.log FALSE 0.04148065 <NA>
## D.chrs.uppr.n.log FALSE 0.03219346 D.chrs.n.log
## D.terms.post.stem.n.log FALSE 0.03316050 D.terms.post.stop.n.log
## D.terms.post.stop.n.log FALSE 0.03375218 <NA>
## D.wrds.n.log FALSE 0.02925929 D.terms.post.stop.n.log
## D.wrds.unq.n.log FALSE 0.03316050 D.terms.post.stem.n.log
## sprice.d20nexp FALSE 0.05068822 <NA>
## sprice.log10 FALSE 0.15003284 sprice.root2
## sprice.root2 FALSE 0.15825611 <NA>
## freqRatio percentUnique zeroVar nzv
## D.chrs.pnct11.n.log 9.797753 0.6896552 FALSE FALSE
## D.chrs.pnct13.n.log 4.664516 0.5911330 FALSE FALSE
## D.chrs.uppr.n.log 12.477273 7.0935961 FALSE FALSE
## D.terms.post.stem.n.log 8.318182 2.0689655 FALSE FALSE
## D.terms.post.stop.n.log 10.557692 2.0689655 FALSE FALSE
## D.wrds.n.log 8.546875 2.2660099 FALSE FALSE
## D.wrds.unq.n.log 8.318182 2.0689655 FALSE FALSE
## sprice.d20nexp 1.312500 46.2068966 FALSE FALSE
## sprice.log10 1.312500 46.2068966 FALSE FALSE
## sprice.root2 1.312500 46.2068966 FALSE FALSE
## is.cor.y.abs.low interaction.feat
## D.chrs.pnct11.n.log FALSE <NA>
## D.chrs.pnct13.n.log FALSE <NA>
## D.chrs.uppr.n.log FALSE <NA>
## D.terms.post.stem.n.log FALSE <NA>
## D.terms.post.stop.n.log FALSE <NA>
## D.wrds.n.log FALSE <NA>
## D.wrds.unq.n.log FALSE <NA>
## sprice.d20nexp FALSE <NA>
## sprice.log10 FALSE <NA>
## sprice.root2 FALSE <NA>
## shapiro.test.p.value rsp_var_raw id_var rsp_var
## D.chrs.pnct11.n.log 1.131872e-48 FALSE NA NA
## D.chrs.pnct13.n.log 5.114801e-42 FALSE NA NA
## D.chrs.uppr.n.log 2.561871e-39 FALSE NA NA
## D.terms.post.stem.n.log 9.919506e-38 FALSE NA NA
## D.terms.post.stop.n.log 9.861611e-38 FALSE NA NA
## D.wrds.n.log 4.183094e-38 FALSE NA NA
## D.wrds.unq.n.log 9.919506e-38 FALSE NA NA
## sprice.d20nexp 5.382519e-55 FALSE NA NA
## sprice.log10 6.612030e-24 FALSE NA NA
## sprice.root2 7.924585e-02 FALSE NA NA
## max min max.sold.fctr.N
## D.chrs.pnct11.n.log 1.9459101 0.000000e+00 1.7917595
## D.chrs.pnct13.n.log 1.9459101 0.000000e+00 1.6094379
## D.chrs.uppr.n.log 4.4659081 0.000000e+00 4.4543473
## D.terms.post.stem.n.log 3.0445224 0.000000e+00 2.9444390
## D.terms.post.stop.n.log 3.0445224 0.000000e+00 2.9444390
## D.wrds.n.log 3.1780538 0.000000e+00 3.0910425
## D.wrds.unq.n.log 3.0445224 0.000000e+00 2.9444390
## sprice.d20nexp 0.8611384 1.929714e-22 0.8191402
## sprice.log10 6.9077453 1.095273e+00 6.8553877
## sprice.root2 31.6226185 1.729162e+00 30.8055190
## max.sold.fctr.Y min.sold.fctr.N min.sold.fctr.Y
## D.chrs.pnct11.n.log 1.3862944 0.000000e+00 0.000000e+00
## D.chrs.pnct13.n.log 1.3862944 0.000000e+00 0.000000e+00
## D.chrs.uppr.n.log 4.4543473 0.000000e+00 0.000000e+00
## D.terms.post.stem.n.log 2.9957323 0.000000e+00 0.000000e+00
## D.terms.post.stop.n.log 2.9957323 0.000000e+00 0.000000e+00
## D.wrds.n.log 3.0445224 0.000000e+00 0.000000e+00
## D.wrds.unq.n.log 2.9957323 0.000000e+00 0.000000e+00
## sprice.d20nexp 0.7909662 2.472641e-21 4.422439e-14
## sprice.log10 6.4216060 1.383791e+00 1.545433e+00
## sprice.root2 24.7989919 1.997498e+00 2.165641e+00
## max.sold.fctr.predict.Ensemble.glmnet.N
## D.chrs.pnct11.n.log 1.9459101
## D.chrs.pnct13.n.log 1.9459101
## D.chrs.uppr.n.log 4.4659081
## D.terms.post.stem.n.log 3.0445224
## D.terms.post.stop.n.log 3.0445224
## D.wrds.n.log 3.1354942
## D.wrds.unq.n.log 3.0445224
## sprice.d20nexp 0.8611384
## sprice.log10 6.9067548
## sprice.root2 31.6069613
## max.sold.fctr.predict.Ensemble.glmnet.Y
## D.chrs.pnct11.n.log 1.3862944
## D.chrs.pnct13.n.log 1.6094379
## D.chrs.uppr.n.log 4.4188406
## D.terms.post.stem.n.log 2.9957323
## D.terms.post.stop.n.log 2.9957323
## D.wrds.n.log 3.0445224
## D.wrds.unq.n.log 2.9957323
## sprice.d20nexp 0.3680634
## sprice.log10 6.3543700
## sprice.root2 23.9791576
## min.sold.fctr.predict.Ensemble.glmnet.N
## D.chrs.pnct11.n.log 0.000000e+00
## D.chrs.pnct13.n.log 0.000000e+00
## D.chrs.uppr.n.log 0.000000e+00
## D.terms.post.stem.n.log 0.000000e+00
## D.terms.post.stop.n.log 0.000000e+00
## D.wrds.n.log 0.000000e+00
## D.wrds.unq.n.log 0.000000e+00
## sprice.d20nexp 2.027639e-22
## sprice.log10 1.095273e+00
## sprice.root2 1.729162e+00
## min.sold.fctr.predict.Ensemble.glmnet.Y
## D.chrs.pnct11.n.log 0.000000e+00
## D.chrs.pnct13.n.log 0.000000e+00
## D.chrs.uppr.n.log 0.000000e+00
## D.terms.post.stem.n.log 0.000000e+00
## D.terms.post.stop.n.log 0.000000e+00
## D.wrds.n.log 0.000000e+00
## D.wrds.unq.n.log 0.000000e+00
## sprice.d20nexp 3.266131e-13
## sprice.log10 2.995232e+00
## sprice.root2 4.471018e+00
## max.sold.fctr.predict.Final.Ensemble.glmnet.N
## D.chrs.pnct11.n.log 1.3862944
## D.chrs.pnct13.n.log 1.9459101
## D.chrs.uppr.n.log 4.4659081
## D.terms.post.stem.n.log 2.9444390
## D.terms.post.stop.n.log 2.9444390
## D.wrds.n.log 3.1780538
## D.wrds.unq.n.log 2.9444390
## sprice.d20nexp 0.7538966
## sprice.log10 6.9077453
## sprice.root2 31.6226185
## max.sold.fctr.predict.Final.Ensemble.glmnet.Y
## D.chrs.pnct11.n.log 1.38629436
## D.chrs.pnct13.n.log 1.38629436
## D.chrs.uppr.n.log 4.35670883
## D.terms.post.stem.n.log 2.94443898
## D.terms.post.stop.n.log 2.94443898
## D.wrds.n.log 3.04452244
## D.wrds.unq.n.log 2.94443898
## sprice.d20nexp 0.08212605
## sprice.log10 6.32614947
## sprice.root2 23.64318084
## min.sold.fctr.predict.Final.Ensemble.glmnet.N
## D.chrs.pnct11.n.log 0.000000e+00
## D.chrs.pnct13.n.log 0.000000e+00
## D.chrs.uppr.n.log 0.000000e+00
## D.terms.post.stem.n.log 0.000000e+00
## D.terms.post.stop.n.log 0.000000e+00
## D.wrds.n.log 0.000000e+00
## D.wrds.unq.n.log 0.000000e+00
## sprice.d20nexp 1.929714e-22
## sprice.log10 1.731656e+00
## sprice.root2 2.376973e+00
## min.sold.fctr.predict.Final.Ensemble.glmnet.Y
## D.chrs.pnct11.n.log 0.000000e+00
## D.chrs.pnct13.n.log 0.000000e+00
## D.chrs.uppr.n.log 0.000000e+00
## D.terms.post.stem.n.log 0.000000e+00
## D.terms.post.stop.n.log 0.000000e+00
## D.wrds.n.log 0.000000e+00
## D.wrds.unq.n.log 0.000000e+00
## sprice.d20nexp 7.268909e-13
## sprice.log10 3.911823e+00
## sprice.root2 7.070361e+00
## [1] "OOBobs sold.fctr.predict.Ensemble.glmnet Y: max > max of Train range: 7"
## UniqueID sold.fctr.predict.Ensemble.glmnet D.chrs.pnct13.n.log
## 744 11443 Y 1.0986123
## 439 11018 Y 0.0000000
## 825 11549 Y 1.0986123
## 517 11135 Y 0.6931472
## 38 10045 Y 0.0000000
## 234 10562 Y 1.6094379
## 906 11702 Y 1.0986123
## D.wrds.stop.n.log
## 744 1.791759
## 439 2.302585
## 825 2.302585
## 517 1.791759
## 38 2.302585
## 234 1.098612
## 906 2.079442
## id cor.y exclude.as.feat
## D.chrs.pnct13.n.log D.chrs.pnct13.n.log -0.041480651 FALSE
## D.wrds.stop.n.log D.wrds.stop.n.log -0.001512415 FALSE
## cor.y.abs cor.high.X freqRatio percentUnique zeroVar
## D.chrs.pnct13.n.log 0.041480651 <NA> 4.664516 0.5911330 FALSE
## D.wrds.stop.n.log 0.001512415 <NA> 8.222222 0.8866995 FALSE
## nzv is.cor.y.abs.low interaction.feat
## D.chrs.pnct13.n.log FALSE FALSE <NA>
## D.wrds.stop.n.log FALSE TRUE <NA>
## shapiro.test.p.value rsp_var_raw id_var rsp_var
## D.chrs.pnct13.n.log 5.114801e-42 FALSE NA NA
## D.wrds.stop.n.log 1.206985e-42 FALSE NA NA
## max min max.sold.fctr.N max.sold.fctr.Y
## D.chrs.pnct13.n.log 1.945910 0 1.609438 1.386294
## D.wrds.stop.n.log 2.397895 0 2.302585 1.609438
## min.sold.fctr.N min.sold.fctr.Y
## D.chrs.pnct13.n.log 0 0
## D.wrds.stop.n.log 0 0
## max.sold.fctr.predict.Ensemble.glmnet.N
## D.chrs.pnct13.n.log 1.94591
## D.wrds.stop.n.log 1.94591
## max.sold.fctr.predict.Ensemble.glmnet.Y
## D.chrs.pnct13.n.log 1.609438
## D.wrds.stop.n.log 2.302585
## min.sold.fctr.predict.Ensemble.glmnet.N
## D.chrs.pnct13.n.log 0
## D.wrds.stop.n.log 0
## min.sold.fctr.predict.Ensemble.glmnet.Y
## D.chrs.pnct13.n.log 0
## D.wrds.stop.n.log 0
## max.sold.fctr.predict.Final.Ensemble.glmnet.N
## D.chrs.pnct13.n.log 1.945910
## D.wrds.stop.n.log 2.397895
## max.sold.fctr.predict.Final.Ensemble.glmnet.Y
## D.chrs.pnct13.n.log 1.386294
## D.wrds.stop.n.log 1.609438
## min.sold.fctr.predict.Final.Ensemble.glmnet.N
## D.chrs.pnct13.n.log 0
## D.wrds.stop.n.log 0
## min.sold.fctr.predict.Final.Ensemble.glmnet.Y
## D.chrs.pnct13.n.log 0
## D.wrds.stop.n.log 0
## [1] "OOBobs total range outliers: 17"
## [1] "newobs sold.fctr.predict.Final.Ensemble.glmnet N: min < min of Train range: 1"
## UniqueID sold.fctr.predict.Final.Ensemble.glmnet sprice.d20nexp
## 1411 12625 N 1.929714e-22
## id cor.y exclude.as.feat cor.y.abs
## sprice.d20nexp sprice.d20nexp 0.05068822 FALSE 0.05068822
## cor.high.X freqRatio percentUnique zeroVar nzv
## sprice.d20nexp <NA> 1.3125 46.2069 FALSE FALSE
## is.cor.y.abs.low interaction.feat shapiro.test.p.value
## sprice.d20nexp FALSE <NA> 5.382519e-55
## rsp_var_raw id_var rsp_var max min
## sprice.d20nexp FALSE NA NA 0.8611384 1.929714e-22
## max.sold.fctr.N max.sold.fctr.Y min.sold.fctr.N
## sprice.d20nexp 0.8611384 0.7909662 2.027639e-22
## min.sold.fctr.Y max.sold.fctr.predict.Ensemble.glmnet.N
## sprice.d20nexp 2.200702e-15 0.8611384
## max.sold.fctr.predict.Ensemble.glmnet.Y
## sprice.d20nexp 0.3680634
## min.sold.fctr.predict.Ensemble.glmnet.N
## sprice.d20nexp 2.027639e-22
## min.sold.fctr.predict.Ensemble.glmnet.Y
## sprice.d20nexp 3.266131e-13
## max.sold.fctr.predict.Final.Ensemble.glmnet.N
## sprice.d20nexp 0.7538966
## max.sold.fctr.predict.Final.Ensemble.glmnet.Y
## sprice.d20nexp 0.08212605
## min.sold.fctr.predict.Final.Ensemble.glmnet.N
## sprice.d20nexp 1.929714e-22
## min.sold.fctr.predict.Final.Ensemble.glmnet.Y
## sprice.d20nexp 7.268909e-13
## [1] "newobs sold.fctr.predict.Final.Ensemble.glmnet N: max > max of Train range: 2"
## UniqueID sold.fctr.predict.Final.Ensemble.glmnet D.wrds.n.log
## 1411 12625 N 2.833213
## 1416 12631 N 3.178054
## D.wrds.stop.n.log sprice.log10 sprice.root2
## 1411 2.397895 6.907745 31.62262
## 1416 2.302585 4.700389 10.48761
## id cor.y exclude.as.feat
## D.wrds.n.log D.wrds.n.log -0.029259294 FALSE
## D.wrds.stop.n.log D.wrds.stop.n.log -0.001512415 FALSE
## sprice.log10 sprice.log10 -0.150032840 FALSE
## sprice.root2 sprice.root2 -0.158256112 FALSE
## cor.y.abs cor.high.X freqRatio
## D.wrds.n.log 0.029259294 D.terms.post.stop.n.log 8.546875
## D.wrds.stop.n.log 0.001512415 <NA> 8.222222
## sprice.log10 0.150032840 sprice.root2 1.312500
## sprice.root2 0.158256112 <NA> 1.312500
## percentUnique zeroVar nzv is.cor.y.abs.low
## D.wrds.n.log 2.2660099 FALSE FALSE FALSE
## D.wrds.stop.n.log 0.8866995 FALSE FALSE TRUE
## sprice.log10 46.2068966 FALSE FALSE FALSE
## sprice.root2 46.2068966 FALSE FALSE FALSE
## interaction.feat shapiro.test.p.value rsp_var_raw id_var
## D.wrds.n.log <NA> 4.183094e-38 FALSE NA
## D.wrds.stop.n.log <NA> 1.206985e-42 FALSE NA
## sprice.log10 <NA> 6.612030e-24 FALSE NA
## sprice.root2 <NA> 7.924585e-02 FALSE NA
## rsp_var max min max.sold.fctr.N
## D.wrds.n.log NA 3.178054 0.000000 3.135494
## D.wrds.stop.n.log NA 2.397895 0.000000 2.302585
## sprice.log10 NA 6.907745 1.095273 6.906755
## sprice.root2 NA 31.622618 1.729162 31.606961
## max.sold.fctr.Y min.sold.fctr.N min.sold.fctr.Y
## D.wrds.n.log 3.135494 0.000000 0.000000
## D.wrds.stop.n.log 2.302585 0.000000 0.000000
## sprice.log10 6.514713 1.095273 1.545433
## sprice.root2 25.980762 1.729162 2.165641
## max.sold.fctr.predict.Ensemble.glmnet.N
## D.wrds.n.log 3.135494
## D.wrds.stop.n.log 1.945910
## sprice.log10 6.906755
## sprice.root2 31.606961
## max.sold.fctr.predict.Ensemble.glmnet.Y
## D.wrds.n.log 3.044522
## D.wrds.stop.n.log 2.302585
## sprice.log10 6.354370
## sprice.root2 23.979158
## min.sold.fctr.predict.Ensemble.glmnet.N
## D.wrds.n.log 0.000000
## D.wrds.stop.n.log 0.000000
## sprice.log10 1.095273
## sprice.root2 1.729162
## min.sold.fctr.predict.Ensemble.glmnet.Y
## D.wrds.n.log 0.000000
## D.wrds.stop.n.log 0.000000
## sprice.log10 2.995232
## sprice.root2 4.471018
## max.sold.fctr.predict.Final.Ensemble.glmnet.N
## D.wrds.n.log 3.178054
## D.wrds.stop.n.log 2.397895
## sprice.log10 6.907745
## sprice.root2 31.622618
## max.sold.fctr.predict.Final.Ensemble.glmnet.Y
## D.wrds.n.log 3.044522
## D.wrds.stop.n.log 1.609438
## sprice.log10 6.326149
## sprice.root2 23.643181
## min.sold.fctr.predict.Final.Ensemble.glmnet.N
## D.wrds.n.log 0.000000
## D.wrds.stop.n.log 0.000000
## sprice.log10 1.731656
## sprice.root2 2.376973
## min.sold.fctr.predict.Final.Ensemble.glmnet.Y
## D.wrds.n.log 0.000000
## D.wrds.stop.n.log 0.000000
## sprice.log10 3.911823
## sprice.root2 7.070361
## [1] "newobs total range outliers: 2"
# Write obsout_df to "<glb_out_pfx>out.csv" without row names.
# NOTE(review): quote = FALSE leaves every field unquoted; presumably obsout_df has no
# free-text columns containing commas -- confirm before relying on the file downstream.
out_fname <- paste0(glb_out_pfx, "out.csv")
write.csv(x = obsout_df, file = out_fname, quote = FALSE, row.names = FALSE)
#cat(" ", "\n", file=submit_fn, append=TRUE)
# print(orderBy(~ -max.auc.OOB, glb_models_df[, c("id",
# "max.auc.OOB", "max.Accuracy.OOB")]))
# For every text feature, print diagnostics of its post-stemming terms table:
# the table itself, the observations containing the term in its last row, the number of
# terms with freq == 1, the freq and term-length distributions, the terms with at least
# 10 characters, and the rows for the terms in select_terms / assoc_terms.
for (txt_var in glbFeatsText) {
    # Terms table for this feature; every display below reads from it
    tmp_df <- glb_post_stem_words_terms_df_lst[[txt_var]]

    # Print post-stem-words but need post-stop-words for debugging ?
    print(sprintf(" All post-stem-words term weights for %s:", txt_var))
    myprint_df(tmp_df)

    # Observations with a positive entry in the matrix column given by the `pos`
    # of the last row of the terms table
    terms_mtrx <- glb_post_stem_words_terms_mtrx_lst[[txt_var]]
    last_term_pos <- tail(tmp_df, 1)$pos
    print(glb_allobs_df[which(terms_mtrx[, last_term_pos] > 0),
                        c(glb_id_var, glbFeatsText)])

    # Count of terms that occur exactly once
    print(nrow(subset(tmp_df, freq == 1)))
    #print(glb_allobs_df[which(terms_mtrx[, 207] > 0), c(glb_id_var, glbFeatsText)])
    #unlist(strsplit(glb_allobs_df[2157, "description"], ""))
    #glb_allobs_df[2442, c(glb_id_var, glbFeatsText)]
    #terms_mtrx[2442, terms_mtrx[2442, ] > 0]

    print(sprintf(" All post-stem-words term freq distribution for %s:", txt_var))
    print(table(tmp_df$freq))

    print(sprintf(" All post-stem-words term length distribution for %s:", txt_var))
    print(table(nchar(tmp_df$term)))
    print(subset(tmp_df, nchar(term) >= 10))

    print(sprintf(" Analyzed term weights for %s:", txt_var))
    anl_terms_vctr <- union(select_terms, assoc_terms)
    print(subset(tmp_df, term %in% anl_terms_vctr))
    # tmp_freq1_df <- subset(tmp_df, freq == 1)
    # tmp_freq1_df$top_n <- grepl(paste0(top_n_vctr, collapse="|"), tmp_freq1_df$term)
    # print(subset(tmp_freq1_df, top_n == TRUE))
}
## [1] " All post-stem-words term weights for descr.my:"
## term weight freq pos cor.y cor.y.abs chisq.stat
## condit condit 82.65906 333 51 -0.013082326 0.013082326 39.43564
## likenew likenew 67.91742 46 116 -0.024950908 0.024950908 11.19258
## use use 59.22119 162 209 -0.006636197 0.006636197 32.24507
## in in 58.77724 291 97 -0.075073146 0.075073146 29.13620
## is is 51.07423 217 109 0.007630386 0.007630386 50.21794
## and and 49.98953 201 19 0.026439136 0.026439136 31.35854
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N
## condit 0.008703004 22.22857 2.167488 TRUE 45.49983
## likenew 0.738827061 196.60000 1.576355 TRUE 36.90375
## use 0.073312742 49.88889 2.266010 TRUE 33.13193
## in 0.141013206 22.55556 2.266010 TRUE 35.19219
## is 0.006114667 43.75000 2.857143 TRUE 25.75795
## and 0.089008983 54.25000 2.266010 TRUE 27.56305
## weight.Y weight.NA
## condit 11.341542 25.81769
## likenew 5.416672 25.59700
## use 8.493944 17.59532
## in 5.786290 17.79877
## is 7.373010 17.94327
## and 8.827670 13.59881
## term weight freq pos cor.y cor.y.abs
## icloud icloud 23.194737 37 95 -0.013440025 0.013440025
## been been 17.962689 47 30 -0.005901474 0.005901474
## profession profession 8.359546 16 159 -0.054379702 0.054379702
## shown shown 5.053924 6 181 -0.028275651 0.028275651
## sinc sinc 4.443334 9 183 -0.036531932 0.036531932
## usb usb 3.270302 5 208 0.037815608 0.037815608
## chisq.stat chisq.pval nzv.freqRatio nzv.percentUnique nzv
## icloud 14.5561626 0.1038713 198.4000 0.9852217 TRUE
## been 6.0409235 0.7358156 140.2857 0.9852217 TRUE
## profession 3.0240005 0.5538170 334.6667 0.4926108 TRUE
## shown 0.8182078 0.8451072 1012.0000 0.3940887 TRUE
## sinc 1.3663801 0.7134340 505.0000 0.3940887 TRUE
## usb 5.2010596 0.1576529 505.0000 0.3940887 TRUE
## weight.N weight.Y weight.NA
## icloud 11.566441 2.215075 9.413222
## been 10.317552 2.582351 5.062787
## profession 5.816299 0.000000 2.543247
## shown 1.793573 0.000000 3.260351
## sinc 2.602072 0.000000 1.841262
## usb 1.847279 1.423023 0.000000
## term weight freq pos cor.y cor.y.abs chisq.stat
## 4g 4g 1.5403968 2 9 -0.01637606 0.01637606 4.077526e-28
## gold gold 1.0488844 1 85 -0.01637606 0.01637606 4.077526e-28
## such such 0.9488844 2 193 0.08520829 0.08520829 3.427842e+00
## ipad4 ipad4 0.8740704 1 103 -0.01637606 0.01637606 4.077526e-28
## ipad1 ipad1 0.8068342 1 100 -0.01637606 0.01637606 4.077526e-28
## sprint sprint 0.7492032 1 190 -0.01637606 0.01637606 4.077526e-28
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N weight.Y
## 4g 1.0000000 1014.0 0.1970443 TRUE 0.6777746 0.0000000
## gold 1.0000000 1014.0 0.1970443 TRUE 1.0488844 0.0000000
## such 0.0641058 506.5 0.1970443 TRUE 0.0000000 0.9488844
## ipad4 1.0000000 1014.0 0.1970443 TRUE 0.8740704 0.0000000
## ipad1 1.0000000 1014.0 0.1970443 TRUE 0.8068342 0.0000000
## sprint 1.0000000 1014.0 0.1970443 TRUE 0.7492032 0.0000000
## weight.NA
## 4g 0.8626222
## gold 0.0000000
## such 0.0000000
## ipad4 0.0000000
## ipad1 0.0000000
## sprint 0.0000000
## UniqueID
## 572 11203
## descr.my
## 572 THIS IPAD LOOKS EXCELLENT !! IT IS UNLOCKED FOR DATA USAGE EXCEPT FOR SPRINT/TMOBILE. IT WILL WORK
## [1] 6
## [1] " All post-stem-words term freq distribution for descr.my:"
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 6 6 5 4 9 13 9 8 13 10 6 5 10 9 4 9 1 4
## 19 20 21 22 23 24 25 27 29 30 31 32 33 36 37 39 40 42
## 4 1 4 3 4 2 3 3 3 4 2 2 2 2 3 3 1 1
## 43 44 45 46 47 50 52 53 55 56 58 59 60 62 66 68 71 72
## 1 1 3 2 1 2 1 1 1 1 1 2 1 1 1 1 2 1
## 83 85 86 87 91 92 94 97 111 124 125 128 130 146 162 165 173 180
## 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 201 217 291 333
## 1 1 1 1
## [1] " All post-stem-words term length distribution for descr.my:"
##
## 1 2 3 4 5 6 7 8 9 10
## 6 17 28 64 51 29 14 8 6 2
## term weight freq pos cor.y cor.y.abs chisq.stat
## profession profession 8.359546 16 159 -0.0543797 0.0543797 3.0240005
## manufactur manufactur 2.543966 3 122 -0.0267640 0.0267640 0.8182078
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N
## profession 0.5538170 334.6667 0.4926108 TRUE 5.816299
## manufactur 0.6642452 506.0000 0.2955665 TRUE 2.543966
## weight.Y weight.NA
## profession 0 2.543247
## manufactur 0 0.000000
## [1] " Analyzed term weights for descr.my:"
## term weight freq pos cor.y cor.y.abs chisq.stat
## condit condit 82.659063 333 51 -0.013082326 0.013082326 39.435636
## is is 51.074233 217 109 0.007630386 0.007630386 50.217935
## has has 32.162810 97 90 0.081745956 0.081745956 33.705526
## or or 32.022568 87 143 -0.002368073 0.002368073 26.378345
## this this 30.522817 128 200 -0.008850439 0.008850439 22.897336
## it it 29.066748 85 110 0.090364752 0.090364752 35.774554
## of of 29.066447 91 138 0.096131208 0.096131208 44.220265
## some some 23.211051 56 188 0.044306219 0.044306219 42.315138
## have have 23.073982 60 91 0.017801186 0.017801186 16.636116
## devic devic 21.897804 68 62 0.020563983 0.020563983 25.017626
## for for 21.454474 37 75 0.070528147 0.070528147 28.135179
## wear wear 20.211045 66 215 0.040531333 0.040531333 23.933921
## but but 19.545236 55 37 0.011210903 0.011210903 23.980514
## been been 17.962689 47 30 -0.005901474 0.005901474 6.040924
## broken broken 17.764857 13 36 0.091379799 0.091379799 23.983012
## will will 16.782505 24 221 -0.032522829 0.032522829 7.627492
## from from 16.283112 45 77 -0.025989943 0.025989943 21.783490
## damag damag 15.546419 17 57 -0.005225674 0.005225674 6.555715
## sign sign 14.973659 45 182 0.079164120 0.079164120 27.819257
## list list 13.596052 39 117 -0.072729364 0.072729364 5.963707
## cover cover 9.933378 14 55 0.084037929 0.084037929 23.503035
## part part 9.282082 13 152 0.062899486 0.062899486 20.265857
## tear tear 8.832556 23 195 0.043092211 0.043092211 18.344342
## smart smart 5.693947 8 186 0.021146840 0.021146840 12.133318
## chisq.pval nzv.freqRatio nzv.percentUnique nzv weight.N
## condit 0.0087030041 22.22857 2.1674877 TRUE 45.499832
## is 0.0061146672 43.75000 2.8571429 TRUE 25.757951
## has 0.0059513741 95.10000 1.6748768 TRUE 13.555539
## or 0.0342290454 50.15789 1.5763547 TRUE 18.694978
## this 0.0863440671 54.88235 1.5763547 TRUE 15.936326
## it 0.0112487436 137.00000 1.9704433 TRUE 11.405740
## of 0.0003178019 95.10000 1.7733990 TRUE 12.640390
## some 0.0002009765 195.20000 1.5763547 TRUE 11.322431
## have 0.1191094302 139.71429 1.1822660 TRUE 10.377318
## devic 0.0090626729 48.25000 1.1822660 TRUE 12.120523
## for 0.0086649283 198.00000 1.3793103 TRUE 8.152849
## wear 0.0207660543 138.71429 1.2807882 TRUE 9.703063
## but 0.0204655165 121.87500 1.2807882 TRUE 10.928261
## been 0.7358156001 140.28571 0.9852217 TRUE 10.317552
## broken 0.0011472108 251.00000 0.7881773 TRUE 2.232755
## will 0.4706766196 249.75000 0.8866995 TRUE 9.561412
## from 0.0095910743 51.73684 0.9852217 TRUE 9.595373
## damag 0.3638926683 251.25000 0.6896552 TRUE 7.244347
## sign 0.0058788088 140.28571 1.2807882 TRUE 6.530079
## list 0.1133893930 51.94737 0.3940887 TRUE 9.375325
## cover 0.0051601614 502.00000 0.9852217 TRUE 3.560457
## part 0.0024831549 251.25000 0.6896552 TRUE 3.425815
## tear 0.0105093224 250.25000 0.7881773 TRUE 3.472475
## smart 0.0330059785 336.00000 0.5911330 TRUE 3.389687
## weight.Y weight.NA
## condit 11.3415423 25.8176892
## is 7.3730103 17.9432713
## has 7.4574197 11.1498515
## or 4.9519805 8.3756099
## this 4.0061123 10.5803784
## it 7.8231782 9.8378302
## of 7.7622170 8.6638393
## some 5.2427666 6.6458539
## have 3.5790666 9.1175980
## devic 4.1059479 5.6713334
## for 6.2326144 7.0690109
## wear 4.0934743 6.4145076
## but 3.4402531 5.1767214
## been 2.5823507 5.0627869
## broken 11.4590591 4.0730428
## will 0.9839803 6.2371122
## from 1.7396791 4.9480602
## damag 1.7070351 6.5950367
## sign 4.2839150 4.1596655
## list 0.2738654 3.9468624
## cover 4.9519708 1.4209502
## part 3.9262301 1.9300366
## tear 2.0307874 3.3292932
## smart 1.7281948 0.5760649
# For binomial classification, show the OOB probability threshold recorded in
# glb_models_df for the selected model.
if (glb_is_classification && glb_is_binomial) {
    sel_mdl_rows <- glb_models_df$id == glb_sel_mdl_id
    print(glb_models_df[sel_mdl_rows, "opt.prob.threshold.OOB"])
}
## [1] 0.4
# Report the id of the selected model
print(sprintf(fmt = "glb_sel_mdl_id: %s", glb_sel_mdl_id))
## [1] "glb_sel_mdl_id: Ensemble.glmnet"
# Report the id of the final model
print(sprintf(fmt = "glb_fin_mdl_id: %s", glb_fin_mdl_id))
## [1] "glb_fin_mdl_id: Final.Ensemble.glmnet"
# Display the model comparison table (helper defined elsewhere in this file).
# The return value is not assigned here, so it is auto-printed at top level.
get_dsp_models_df()
## [1] "Cross Validation issues:"
## Warning in get_dsp_models_df(): Cross Validation issues:
## MFO.myMFO_classfr Random.myrandom_classfr
## 0 0
## Max.cor.Y.rcv.1X1.glmnet Max.cor.Y.rcv.1X1.cp.0.rpart
## 0 0
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_point).
## id
## Max.cor.Y.rcv.1X1.glmnet Max.cor.Y.rcv.1X1.glmnet
## Max.cor.Y.rcv.5X1.glmnet Max.cor.Y.rcv.5X1.glmnet
## Max.cor.Y.rcv.5X3.glmnet Max.cor.Y.rcv.5X3.glmnet
## Max.cor.Y.rcv.5X5.glmnet Max.cor.Y.rcv.5X5.glmnet
## Max.cor.Y.rcv.1X1.spatialSign.glmnet Max.cor.Y.rcv.1X1.spatialSign.glmnet
## Interact.High.cor.Y.glmnet Interact.High.cor.Y.glmnet
## RFE.X.glm RFE.X.glm
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet Max.cor.Y.rcv.1X1.YeoJohnson.glmnet
## Max.cor.Y.rcv.3X3.glmnet Max.cor.Y.rcv.3X3.glmnet
## Max.cor.Y.rcv.1X1.Interact.glmnet Max.cor.Y.rcv.1X1.Interact.glmnet
## Max.cor.Y.rcv.1X1.center.scale.glmnet Max.cor.Y.rcv.1X1.center.scale.glmnet
## Max.cor.Y.rcv.1X1.range.glmnet Max.cor.Y.rcv.1X1.range.glmnet
## Max.cor.Y.rcv.3X1.glmnet Max.cor.Y.rcv.3X1.glmnet
## Max.cor.Y.rcv.3X5.glmnet Max.cor.Y.rcv.3X5.glmnet
## RFE.X.glmnet RFE.X.glmnet
## All.X.glmnet All.X.glmnet
## Low.cor.X.glmnet Low.cor.X.glmnet
## RFE.X.bayesglm RFE.X.bayesglm
## Max.cor.Y.rcv.1X1.pca.glmnet Max.cor.Y.rcv.1X1.pca.glmnet
## Max.cor.Y.rcv.1X1.ica.glmnet Max.cor.Y.rcv.1X1.ica.glmnet
## RFE.X.gbm RFE.X.gbm
## RFE.X.avNNet RFE.X.avNNet
## RFE.X.rf RFE.X.rf
## Ensemble.glmnet Ensemble.glmnet
## Max.cor.Y.rcv.1X1.cp.0.rpart Max.cor.Y.rcv.1X1.cp.0.rpart
## RFE.X.nnet RFE.X.nnet
## RFE.X.earth RFE.X.earth
## RFE.X.rpart RFE.X.rpart
## Max.cor.Y.rpart Max.cor.Y.rpart
## MFO.myMFO_classfr MFO.myMFO_classfr
## Random.myrandom_classfr Random.myrandom_classfr
## Final.Ensemble.glmnet Final.Ensemble.glmnet
## Final.Ensemble.glm Final.Ensemble.glm
## RFE.X.Train.glmnet RFE.X.Train.glmnet
## RFE.X.Train.avNNet RFE.X.Train.avNNet
## RFE.X.Train.earth RFE.X.Train.earth
## RFE.X.Train.nnet RFE.X.Train.nnet
## RFE.X.Train.rpart RFE.X.Train.rpart
## RFE.X.Train.bayesglm RFE.X.Train.bayesglm
## RFE.X.Train.glm RFE.X.Train.glm
## max.AUCROCR.OOB max.AUCpROC.OOB
## Max.cor.Y.rcv.1X1.glmnet 0.7423189 0.5529412
## Max.cor.Y.rcv.5X1.glmnet 0.7398882 0.5497326
## Max.cor.Y.rcv.5X3.glmnet 0.7398882 0.5497326
## Max.cor.Y.rcv.5X5.glmnet 0.7398882 0.5497326
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.7264706 0.5187166
## Interact.High.cor.Y.glmnet 0.7259844 0.5187166
## RFE.X.glm 0.7157997 0.6304813
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.7150705 0.5187166
## Max.cor.Y.rcv.3X3.glmnet 0.7148517 0.5187166
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.7148517 0.5187166
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.7148517 0.5187166
## Max.cor.Y.rcv.1X1.range.glmnet 0.7148517 0.5187166
## Max.cor.Y.rcv.3X1.glmnet 0.7148517 0.5187166
## Max.cor.Y.rcv.3X5.glmnet 0.7148517 0.5187166
## RFE.X.glmnet 0.7120929 0.5406417
## All.X.glmnet 0.7120929 0.5406417
## Low.cor.X.glmnet 0.7107560 0.5406417
## RFE.X.bayesglm 0.7037555 0.6000000
## Max.cor.Y.rcv.1X1.pca.glmnet 0.7029655 0.5529412
## Max.cor.Y.rcv.1X1.ica.glmnet 0.6876276 0.5000000
## RFE.X.gbm 0.6834346 0.5368984
## RFE.X.avNNet 0.6544361 0.5810160
## RFE.X.rf 0.6501823 0.5000000
## Ensemble.glmnet 0.6417477 0.6272727
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.6280870 0.5914439
## RFE.X.nnet 0.5818060 0.5810160
## RFE.X.earth 0.5719251 0.5719251
## RFE.X.rpart 0.5000000 0.5000000
## Max.cor.Y.rpart 0.5000000 0.5000000
## MFO.myMFO_classfr 0.5000000 0.5000000
## Random.myrandom_classfr 0.4962567 0.5168449
## Final.Ensemble.glmnet NA NA
## Final.Ensemble.glm NA NA
## RFE.X.Train.glmnet NA NA
## RFE.X.Train.avNNet NA NA
## RFE.X.Train.earth NA NA
## RFE.X.Train.nnet NA NA
## RFE.X.Train.rpart NA NA
## RFE.X.Train.bayesglm NA NA
## RFE.X.Train.glm NA NA
## max.Accuracy.OOB max.Accuracy.fit
## Max.cor.Y.rcv.1X1.glmnet 0.6880165 0.7118644
## Max.cor.Y.rcv.5X1.glmnet 0.6797521 0.8154583
## Max.cor.Y.rcv.5X3.glmnet 0.6797521 0.8123155
## Max.cor.Y.rcv.5X5.glmnet 0.6797521 0.8101741
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.6590909 0.8079246
## Interact.High.cor.Y.glmnet 0.6508264 0.8098079
## RFE.X.glm 0.7314050 0.7407370
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.6776860 0.8110598
## Max.cor.Y.rcv.3X3.glmnet 0.6776860 0.8110598
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.6776860 0.8110598
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.6776860 0.8110598
## Max.cor.Y.rcv.1X1.range.glmnet 0.6776860 0.8110598
## Max.cor.Y.rcv.3X1.glmnet 0.6776860 0.8098207
## Max.cor.Y.rcv.3X5.glmnet 0.6776860 0.8098088
## RFE.X.glmnet 0.6652893 0.8054348
## All.X.glmnet 0.6652893 0.8041793
## Low.cor.X.glmnet 0.6673554 0.8048071
## RFE.X.bayesglm 0.7231405 0.7815306
## Max.cor.Y.rcv.1X1.pca.glmnet 0.6487603 0.8114577
## Max.cor.Y.rcv.1X1.ica.glmnet 0.6528926 0.8010051
## RFE.X.gbm 0.6239669 0.7978522
## RFE.X.avNNet 0.7417355 0.8003774
## RFE.X.rf 0.2272727 0.7984934
## Ensemble.glmnet 0.7314050 0.9422608
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.7190083 0.8097928
## RFE.X.nnet 0.2272727 0.8022536
## RFE.X.earth 0.2272727 0.7941069
## RFE.X.rpart 0.2272727 0.7941175
## Max.cor.Y.rpart 0.2272727 0.7897092
## MFO.myMFO_classfr 0.2272727 0.2015066
## Random.myrandom_classfr 0.2272727 0.2015066
## Final.Ensemble.glmnet NA 0.9044836
## Final.Ensemble.glm NA 0.9015172
## RFE.X.Train.glmnet NA 0.7944209
## RFE.X.Train.avNNet NA 0.7898177
## RFE.X.Train.earth NA 0.7848935
## RFE.X.Train.nnet NA 0.7839073
## RFE.X.Train.rpart NA 0.7822607
## RFE.X.Train.bayesglm NA 0.7783198
## RFE.X.Train.glm NA 0.7727557
## opt.prob.threshold.fit
## Max.cor.Y.rcv.1X1.glmnet 0.2
## Max.cor.Y.rcv.5X1.glmnet 0.2
## Max.cor.Y.rcv.5X3.glmnet 0.2
## Max.cor.Y.rcv.5X5.glmnet 0.2
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.2
## Interact.High.cor.Y.glmnet 0.2
## RFE.X.glm 0.3
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.2
## Max.cor.Y.rcv.3X3.glmnet 0.2
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.2
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.2
## Max.cor.Y.rcv.1X1.range.glmnet 0.2
## Max.cor.Y.rcv.3X1.glmnet 0.2
## Max.cor.Y.rcv.3X5.glmnet 0.2
## RFE.X.glmnet 0.2
## All.X.glmnet 0.2
## Low.cor.X.glmnet 0.2
## RFE.X.bayesglm 0.3
## Max.cor.Y.rcv.1X1.pca.glmnet 0.2
## Max.cor.Y.rcv.1X1.ica.glmnet 0.2
## RFE.X.gbm 0.2
## RFE.X.avNNet 0.3
## RFE.X.rf 0.1
## Ensemble.glmnet 0.4
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.3
## RFE.X.nnet 0.4
## RFE.X.earth 0.1
## RFE.X.rpart 0.2
## Max.cor.Y.rpart 0.2
## MFO.myMFO_classfr 0.2
## Random.myrandom_classfr 0.2
## Final.Ensemble.glmnet 0.4
## Final.Ensemble.glm 0.3
## RFE.X.Train.glmnet 0.2
## RFE.X.Train.avNNet 0.3
## RFE.X.Train.earth 0.3
## RFE.X.Train.nnet 0.3
## RFE.X.Train.rpart 0.2
## RFE.X.Train.bayesglm 0.3
## RFE.X.Train.glm 0.3
## opt.prob.threshold.OOB
## Max.cor.Y.rcv.1X1.glmnet 0.2
## Max.cor.Y.rcv.5X1.glmnet 0.2
## Max.cor.Y.rcv.5X3.glmnet 0.2
## Max.cor.Y.rcv.5X5.glmnet 0.2
## Max.cor.Y.rcv.1X1.spatialSign.glmnet 0.2
## Interact.High.cor.Y.glmnet 0.2
## RFE.X.glm 0.3
## Max.cor.Y.rcv.1X1.YeoJohnson.glmnet 0.2
## Max.cor.Y.rcv.3X3.glmnet 0.2
## Max.cor.Y.rcv.1X1.Interact.glmnet 0.2
## Max.cor.Y.rcv.1X1.center.scale.glmnet 0.2
## Max.cor.Y.rcv.1X1.range.glmnet 0.2
## Max.cor.Y.rcv.3X1.glmnet 0.2
## Max.cor.Y.rcv.3X5.glmnet 0.2
## RFE.X.glmnet 0.2
## All.X.glmnet 0.2
## Low.cor.X.glmnet 0.2
## RFE.X.bayesglm 0.3
## Max.cor.Y.rcv.1X1.pca.glmnet 0.2
## Max.cor.Y.rcv.1X1.ica.glmnet 0.2
## RFE.X.gbm 0.2
## RFE.X.avNNet 0.3
## RFE.X.rf 0.0
## Ensemble.glmnet 0.4
## Max.cor.Y.rcv.1X1.cp.0.rpart 0.2
## RFE.X.nnet 0.0
## RFE.X.earth 0.1
## RFE.X.rpart 0.2
## Max.cor.Y.rpart 0.2
## MFO.myMFO_classfr 0.2
## Random.myrandom_classfr 0.2
## Final.Ensemble.glmnet NA
## Final.Ensemble.glm NA
## RFE.X.Train.glmnet NA
## RFE.X.Train.avNNet NA
## RFE.X.Train.earth NA
## RFE.X.Train.nnet NA
## RFE.X.Train.rpart NA
## RFE.X.Train.bayesglm NA
## RFE.X.Train.glm NA
# Regression-only summary:
#   1. print the selected model's OOB RMSE;
#   2. if a category variable is set, merge per-category stats for the training ("trn")
#      and new ("new") observations of the final model into glb_ctgry_df, print it
#      sorted by mean absolute error, then print the column totals;
#   3. if glb_newobs_df carries a fully populated response, print the final model's
#      prediction stats on it.
if (glb_is_regression) {
    print(sprintf("%s OOB RMSE: %0.4f", glb_sel_mdl_id,
                  glb_models_df[glb_models_df$id == glb_sel_mdl_id, "min.RMSE.OOB"]))

    if (!is.null(glb_category_var)) {
        #stop(here"); glb_to_sav(); glb_ctgry_df <- sav_ctgry_df
        # OOB_ctgry_df <- myget_category_stats(glb_OOBobs_df, glb_sel_mdl_id, "OOB")
        # glb_ctgry_df <- merge(glb_ctgry_df, subset(OOB_ctgry_df, select=-.n.OOB),
        # by=glb_category_var, all=TRUE)
        #
        # #glb_fitobs_df <- glb_get_predictions(glb_fitobs_df, glb_sel_mdl_id, glb_rsp_var_out)
        # glb_ctgry_df <- merge(glb_ctgry_df,
        # myget_category_stats(obs_df=glb_fitobs_df, mdl_id=glb_sel_mdl_id, label="fit"),
        # by=glb_category_var, all=TRUE)
        # row.names(glb_ctgry_df) <- glb_ctgry_df[, glb_category_var]
        glb_ctgry_df <- merge(glb_ctgry_df,
            myget_category_stats(obs_df=glb_trnobs_df, mdl_id=glb_fin_mdl_id, label="trn"),
                              by=glb_category_var, all=TRUE)
        # merge() resets row names; restore the category labels
        row.names(glb_ctgry_df) <- glb_ctgry_df[, glb_category_var]
        glb_ctgry_df <- merge(glb_ctgry_df,
            myget_category_stats(obs_df=glb_newobs_df, mdl_id=glb_fin_mdl_id, label="new"),
                              by=glb_category_var, all=TRUE)
        row.names(glb_ctgry_df) <- glb_ctgry_df[, glb_category_var]

        if (any(grepl("OOB", glbMdlMetricsEval)))
            print(orderBy(~-err.abs.OOB.mean, glb_ctgry_df)) else
            print(orderBy(~-err.abs.fit.mean, glb_ctgry_df))

        # Exclude the category column by exact name. The previous -grep(...) treated the
        # name as a regular expression and would select zero columns on an empty match.
        # drop = FALSE keeps a data frame even if only one column remains.
        ctgry_stat_cols <- setdiff(names(glb_ctgry_df), glb_category_var)
        print(colSums(glb_ctgry_df[, ctgry_stat_cols, drop = FALSE]))
    }

    # Only evaluate on the new observations when their response is fully known
    if ((glb_rsp_var %in% names(glb_newobs_df)) &&
        !(any(is.na(glb_newobs_df[, glb_rsp_var])))) {
        pred_stats_df <-
            mypredict_mdl(mdl=glb_models_lst[[glb_fin_mdl_id]],
                          df=glb_newobs_df,
                          rsp_var=glb_rsp_var,
                          rsp_var_out=glb_rsp_var_out,
                          mdl_id=glb_fin_mdl_id,
                          label="new",
                          # NOTE(review): summary function / metric come from glb_sel_mdl
                          # although the predicting model is glb_fin_mdl_id -- confirm intended
                          model_summaryFunction=glb_sel_mdl$control$summaryFunction,
                          model_metric=glb_sel_mdl$metric,
                          model_metric_maximize=glb_sel_mdl$maximize,
                          ret_type="stats")
        print(sprintf("%s prediction stats for glb_newobs_df:", glb_fin_mdl_id))
        print(pred_stats_df)
    }
}
# Classification-only summary:
#   1. print the selected model's OOB confusion table (transposed so that Reference is
#      on the rows and Prediction on the columns);
#   2. if a category variable is set, merge per-category stats for the training ("trn")
#      and new ("new") observations of the final model into glb_ctgry_df, print it
#      sorted by mean absolute error, then print the column totals;
#   3. if glb_newobs_df carries a fully populated response, print the final model's
#      confusion table on it.
if (glb_is_classification) {
    # NOTE(review): the label mentions accuracy but only the confusion table is printed
    print(sprintf("%s OOB confusion matrix & accuracy: ", glb_sel_mdl_id))
    print(t(confusionMatrix(glb_OOBobs_df[, paste0(glb_rsp_var_out, glb_sel_mdl_id)],
                            glb_OOBobs_df[, glb_rsp_var])$table))

    if (!is.null(glb_category_var)) {
        glb_ctgry_df <- merge(glb_ctgry_df,
            myget_category_stats(obs_df=glb_trnobs_df, mdl_id=glb_fin_mdl_id, label="trn"),
                              by=glb_category_var, all=TRUE)
        # merge() resets row names; restore the category labels
        row.names(glb_ctgry_df) <- glb_ctgry_df[, glb_category_var]
        glb_ctgry_df <- merge(glb_ctgry_df,
            myget_category_stats(obs_df=glb_newobs_df, mdl_id=glb_fin_mdl_id, label="new"),
                              by=glb_category_var, all=TRUE)
        row.names(glb_ctgry_df) <- glb_ctgry_df[, glb_category_var]

        if (any(grepl("OOB", glbMdlMetricsEval)))
            print(orderBy(~-err.abs.OOB.mean, glb_ctgry_df)) else
            print(orderBy(~-err.abs.fit.mean, glb_ctgry_df))

        # Exclude the category column by exact name. The previous -grep(...) treated the
        # name as a regular expression and would select zero columns on an empty match.
        # drop = FALSE keeps a data frame even if only one column remains.
        ctgry_stat_cols <- setdiff(names(glb_ctgry_df), glb_category_var)
        print(colSums(glb_ctgry_df[, ctgry_stat_cols, drop = FALSE]))
        # print("Top category OOB errors:")
        # print(glb_OOBobs_df[(glb_OOBobs_df[, glb_category_var] ==
        # dsp_ctgry_df[1, glb_category_var]) &
        # !glb_OOBobs_df[, predct_accurate_var_name],
        # c(glb_id_var, glb_rsp_var_raw, paste0(glb_rsp_var_out, glb_sel_mdl_id),
        # glb_category_var,
        # row.names(head(myget_feats_importance(glb_sel_mdl), 5)),
        # # "biddable", "startprice", "condition",
        # glbFeatsText)])
    }

    # Only evaluate on the new observations when their response is fully known
    if ((glb_rsp_var %in% names(glb_newobs_df)) &&
        !(any(is.na(glb_newobs_df[, glb_rsp_var])))) {
        print(sprintf("%s new confusion matrix & accuracy: ", glb_fin_mdl_id))
        print(t(confusionMatrix(glb_newobs_df[, paste0(glb_rsp_var_out, glb_fin_mdl_id)],
                                glb_newobs_df[, glb_rsp_var])$table))
    }
}
## [1] "Ensemble.glmnet OOB confusion matrix & accuracy: "
## Prediction
## Reference N Y
## N 300 74
## Y 56 54
## prdl.descr.my.fctr .n.OOB .n.Tst .freqRatio.Tst .freqRatio.OOB
## iPadmini3#0 iPadmini3#0 21 18 0.04265403 0.04338843
## iPadmini2#1 iPadmini2#1 9 8 0.01895735 0.01859504
## iPad2#0 iPad2#0 35 31 0.07345972 0.07231405
## iPad4#0 iPad4#0 21 18 0.04265403 0.04338843
## Unknown#0 Unknown#0 35 31 0.07345972 0.07231405
## Unknown#1 Unknown#1 26 23 0.05450237 0.05371901
## iPad1#0 iPad1#0 18 16 0.03791469 0.03719008
## iPadAir2#0 iPadAir2#0 32 28 0.06635071 0.06611570
## iPadmini2#0 iPadmini2#0 17 14 0.03317536 0.03512397
## iPad1#1 iPad1#1 26 22 0.05213270 0.05371901
## iPad2#1 iPad2#1 59 53 0.12559242 0.12190083
## iPadmini#0 iPadmini#0 30 26 0.06161137 0.06198347
## iPad3#1 iPad3#1 23 20 0.04739336 0.04752066
## iPad4#1 iPad4#1 27 24 0.05687204 0.05578512
## iPadmini#1 iPadmini#1 31 26 0.06161137 0.06404959
## iPadAir#1 iPadAir#1 22 19 0.04502370 0.04545455
## iPad3#0 iPad3#0 14 12 0.02843602 0.02892562
## iPadAir#0 iPadAir#0 22 19 0.04502370 0.04545455
## iPadmini3#1 iPadmini3#1 7 6 0.01421801 0.01446281
## iPadAir2#1 iPadAir2#1 9 8 0.01895735 0.01859504
## err.abs.fit.sum err.abs.fit.mean .n.fit err.abs.OOB.sum
## iPadmini3#0 2.26890314 0.070903223 32 9.080860
## iPadmini2#1 0.90301012 0.082091829 11 3.602456
## iPad2#0 1.46974670 0.077355089 19 12.909094
## iPad4#0 2.70907924 0.193505660 14 7.564998
## Unknown#0 4.82909486 0.150909214 32 12.443978
## Unknown#1 3.11621011 0.115415189 27 8.854515
## iPad1#0 4.03995924 0.109188088 37 5.970610
## iPadAir2#0 5.52284739 0.117507391 47 9.281717
## iPadmini2#0 2.82227032 0.117594597 24 4.856905
## iPad1#1 1.45599724 0.055999894 26 7.231008
## iPad2#1 1.35982569 0.041206839 33 16.002257
## iPadmini#0 5.95122836 0.097561121 61 7.743971
## iPad3#1 4.00207996 0.117708234 34 5.782836
## iPad4#1 1.47089112 0.047448101 31 6.720209
## iPadmini#1 1.28096147 0.067419025 19 7.440098
## iPadAir#1 0.58445407 0.018853357 31 4.660690
## iPad3#0 0.04356988 0.008713976 5 2.610921
## iPadAir#0 2.81582512 0.090833068 31 4.033037
## iPadmini3#1 NA NA NA 1.029511
## iPadAir2#1 1.20189395 0.070699644 17 1.128339
## err.abs.OOB.mean err.abs.trn.sum err.abs.trn.mean .n.trn
## iPadmini3#0 0.4324219 9.125996 0.17218860 53
## iPadmini2#1 0.4002729 3.695516 0.18477579 20
## iPad2#0 0.3688313 7.432295 0.13763508 54
## iPad4#0 0.3602380 5.685116 0.16243188 35
## Unknown#0 0.3555422 13.297555 0.19847097 67
## Unknown#1 0.3405583 9.749837 0.18395918 53
## iPad1#0 0.3317006 10.315132 0.18754786 55
## iPadAir2#0 0.2900537 13.024684 0.16486942 79
## iPadmini2#0 0.2857003 6.294523 0.15352494 41
## iPad1#1 0.2781157 3.843990 0.07392289 52
## iPad2#1 0.2712247 12.881867 0.14002029 92
## iPadmini#0 0.2581324 13.425080 0.14752835 91
## iPad3#1 0.2514277 9.726148 0.17063417 57
## iPad4#1 0.2488966 7.757177 0.13374442 58
## iPadmini#1 0.2400032 6.519391 0.13038782 50
## iPadAir#1 0.2118495 6.198915 0.11696066 53
## iPad3#0 0.1864944 2.927842 0.15409696 19
## iPadAir#0 0.1833198 7.695518 0.14519845 53
## iPadmini3#1 0.1470729 1.105782 0.15796881 7
## iPadAir2#1 0.1253710 4.459238 0.17150916 26
## err.abs.new.sum err.abs.new.mean .n.new
## iPadmini3#0 NA NA 18
## iPadmini2#1 NA NA 8
## iPad2#0 NA NA 31
## iPad4#0 NA NA 18
## Unknown#0 NA NA 31
## Unknown#1 NA NA 23
## iPad1#0 NA NA 16
## iPadAir2#0 NA NA 28
## iPadmini2#0 NA NA 14
## iPad1#1 NA NA 22
## iPad2#1 NA NA 53
## iPadmini#0 NA NA 26
## iPad3#1 NA NA 20
## iPad4#1 NA NA 24
## iPadmini#1 NA NA 26
## iPadAir#1 NA NA 19
## iPad3#0 NA NA 12
## iPadAir#0 NA NA 19
## iPadmini3#1 NA NA 6
## iPadAir2#1 NA NA 8
## .n.OOB .n.Tst .freqRatio.Tst .freqRatio.OOB
## 484.000000 422.000000 1.000000 1.000000
## err.abs.fit.sum err.abs.fit.mean .n.fit err.abs.OOB.sum
## NA NA NA 138.948009
## err.abs.OOB.mean err.abs.trn.sum err.abs.trn.mean .n.trn
## 5.567227 155.161599 3.087376 1015.000000
## err.abs.new.sum err.abs.new.mean .n.new
## NA NA 422.000000
# When a feature-importance table exists, print its rows with importance > 10,
# ordered descending on the "<glb_sel_mdl_id>.importance" column.
if (!is.null(glb_featsimp_df)) {
    imp_order_fmla <- as.formula(paste0("~ -", glb_sel_mdl_id, ".importance"))
    print(orderBy(imp_order_fmla, subset(glb_featsimp_df, importance > 10)))
}
## Ensemble.glmnet.importance
## sold.fctr.predict.RFE.X.Train.avNNet.prob NA
## sold.fctr.predict.RFE.X.Train.nnet.prob NA
## sold.fctr.predict.RFE.X.Train.earth.prob NA
## sold.fctr.predict.RFE.X.Train.glm.prob NA
## sold.fctr.predict.RFE.X.Train.bayesglm.prob NA
## sold.fctr.predict.RFE.X.Train.rpart.prob NA
## importance
## sold.fctr.predict.RFE.X.Train.avNNet.prob 100.00000
## sold.fctr.predict.RFE.X.Train.nnet.prob 58.86597
## sold.fctr.predict.RFE.X.Train.earth.prob 56.79193
## sold.fctr.predict.RFE.X.Train.glm.prob 50.63200
## sold.fctr.predict.RFE.X.Train.bayesglm.prob 47.39083
## sold.fctr.predict.RFE.X.Train.rpart.prob 47.39083
## Final.Ensemble.glmnet.importance
## sold.fctr.predict.RFE.X.Train.avNNet.prob 100.00000
## sold.fctr.predict.RFE.X.Train.nnet.prob 58.86597
## sold.fctr.predict.RFE.X.Train.earth.prob 56.79193
## sold.fctr.predict.RFE.X.Train.glm.prob 50.63200
## sold.fctr.predict.RFE.X.Train.bayesglm.prob 47.39083
## sold.fctr.predict.RFE.X.Train.rpart.prob 47.39083
# Label for the glb_newobs_df prediction summary printed next
print("glb_newobs_df prediction stats:")
## [1] "glb_newobs_df prediction stats:"
# Summarise the final model's predictions on the new observations:
# a histogram for regression, a table of predicted values for classification.
if (glb_is_regression) {
    print(myplot_histogram(glb_newobs_df, paste0(glb_rsp_var_out, glb_fin_mdl_id)))
}
if (glb_is_classification) {
    # The column is passed as an expression (not a named variable) so that table()
    # prints no dimension-name header.
    print(table(glb_newobs_df[, paste0(glb_rsp_var_out, glb_fin_mdl_id)]))
}
##
## N Y
## 352 70
# Use this to see how prediction changes by changing one or more values
# players_df <- data.frame(id=c("Chavez", "Giambi", "Menechino", "Myers", "Pena"),
# OBP=c(0.338, 0.391, 0.369, 0.313, 0.361),
# SLG=c(0.540, 0.450, 0.374, 0.447, 0.500),
# cost=c(1400000, 1065000, 295000, 800000, 300000))
# players_df$RS.predict <- predict(glb_models_lst[[csm_mdl_id]], players_df)
# print(orderBy(~ -RS.predict, players_df))
# dsp_chisq.test(Headline.contains="[Vi]deo")
# Sanity check: every column of the trn / fit / OOB / new observation data frames must
# also exist in glb_allobs_df. Stops at the first data frame that has extra columns,
# after printing which data frame it is and the offending column names.
# Uses dedicated variable names instead of assigning to `diff`, which would leave a
# global object shadowing base::diff.
for (obs_df_name in c("glb_trnobs_df", "glb_fitobs_df", "glb_OOBobs_df", "glb_newobs_df")) {
    extra_cols <- setdiff(names(get(obs_df_name)), names(glb_allobs_df))
    if (length(extra_cols) > 0) {
        print(sprintf("%s has columns missing from glb_allobs_df:", obs_df_name))
        print(extra_cols)
        stop("glb_*obs_df not in sync")
    }
}
# Optionally persist the key modelling objects to "<glb_out_pfx>prdnew_dsk.RData".
if (glb_save_envir) {
    save(list = c("glb_feats_df", "glb_allobs_df",
                  #"glb_trnobs_df", "glb_fitobs_df", "glb_OOBobs_df", "glb_newobs_df",
                  "glb_models_df", "dsp_models_df", "glb_models_lst", "glb_model_type",
                  "glb_sel_mdl", "glb_sel_mdl_id",
                  "glb_fin_mdl", "glb_fin_mdl_id"),
         file = paste0(glb_out_pfx, "prdnew_dsk.RData"))
}

# Keep copies of the final and selected models, regardless of glb_save_envir
# (used by the commented-out comparison snippets below)
sav_fin_mdl <- glb_fin_mdl
sav_sel_mdl <- glb_sel_mdl
#save(sav_fin_mdl, sav_sel_mdl, file=paste0(glb_out_pfx, "sav_mdl.RData"))
# load(file=paste0(glb_out_pfx, "sav_mdl_01.RData"), verbose=TRUE)
# prv_fin_mdl <- sav_fin_mdl; prv_sel_mdl <- sav_sel_mdl
# load(file=paste0(glb_out_pfx, "sav_mdl.RData"), verbose=TRUE)
# cur_fin_mdl <- sav_fin_mdl; cur_sel_mdl <- sav_sel_mdl
# all.equal(cur_fin_mdl, prv_fin_mdl)
# cur_fitobs_df <- cur_fin_mdl$trainingData; prv_fitobs_df <- prv_fin_mdl$trainingData; all.equal(cur_fitobs_df, prv_fitobs_df)
# nrow(cur_fitobs_df); nrow(prv_fitobs_df)
# names(cur_fitobs_df); names(prv_fitobs_df)
# all.equal(cur_fin_mdl$bestTune, prv_fin_mdl$bestTune)
# all.equal(glb_sel_mdl, sav_sel_mdl)
# cur_fitobs_df <- cur_sel_mdl$trainingData; prv_fitobs_df <- prv_sel_mdl$trainingData; all.equal(cur_fitobs_df, prv_fitobs_df)
# head(myget_feats_importance(glb_sel_mdl)); head(myget_feats_importance(sav_sel_mdl))
# head(myget_feats_importance(cur_sel_mdl)); head(myget_feats_importance(prv_sel_mdl))
# tmp_replay_lst <- replay.petrisim(pn=glb_analytics_pn,
# replay.trans=(glb_analytics_avl_objs <- c(glb_analytics_avl_objs,
# "data.new.prediction")), flip_coord=TRUE)
# print(ggplot.petrinet(tmp_replay_lst[["pn"]]) + coord_flip())
# Register the next pipeline chunk, "display.session.info", in the chunk
# timing table. major.inc=TRUE requests a new major step number (the echoed
# table below shows step_major going from 8 to 9).
# NOTE(review): myadd_chunk() presumably also stamps the end time of the
# previous chunk -- confirm against its definition.
glb_chunks_df <- myadd_chunk(glb_chunks_df, "display.session.info", major.inc=TRUE)
## label step_major step_minor label_minor bgn end
## 16 predict.data.new 8 0 0 927.165 956.162
## 17 display.session.info 9 0 0 956.163 NA
## elapsed
## 16 28.997
## 17 NA
Null Hypothesis (\(\sf{H_{0}}\)): mpg is not impacted by am_fctr.
The variance by am_fctr appears to be independent.
#{r q1, cache=FALSE}
# print(t.test(subset(cars_df, am_fctr == "automatic")$mpg,
#              subset(cars_df, am_fctr == "manual")$mpg,
#              var.equal=FALSE)$conf)
# We reject the null hypothesis, i.e., we have evidence to conclude that am_fctr impacts mpg (95% confidence). Manual transmission is better than automatic transmission for miles per gallon.
## label step_major step_minor label_minor bgn
## 11 fit.models 6 1 1 298.940
## 14 fit.data.training 7 0 0 659.988
## 5 extract.features 3 0 0 26.903
## 12 fit.models 6 2 2 570.369
## 10 fit.models 6 0 0 221.813
## 7 cluster.data 3 2 2 154.626
## 16 predict.data.new 8 0 0 927.165
## 9 select.features 5 0 0 194.566
## 1 import.data 1 0 0 9.734
## 13 fit.models 6 3 3 649.986
## 15 fit.data.training 7 1 1 918.427
## 2 inspect.data 2 0 0 20.676
## 3 scrub.data 2 1 1 25.488
## 8 partition.data.training 4 0 0 193.695
## 4 transform.data 2 2 2 26.509
## 6 manage.missing.data 3 1 1 154.462
## end elapsed duration
## 11 570.368 271.428 271.428
## 14 918.427 258.439 258.439
## 5 154.461 127.559 127.558
## 12 649.985 79.616 79.616
## 10 298.939 77.126 77.126
## 7 193.695 39.069 39.069
## 16 956.162 28.997 28.997
## 9 221.813 27.247 27.247
## 1 20.675 10.942 10.941
## 13 659.988 10.002 10.002
## 15 927.165 8.738 8.738
## 2 25.487 4.811 4.811
## 3 26.509 1.021 1.021
## 8 194.566 0.871 0.871
## 4 26.903 0.394 0.394
## 6 154.625 0.163 0.163
## [1] "Total Elapsed Time: 956.162 secs"
## label step_major step_minor
## 10 fit.models_1_RFE.X 2 8
## 15 fit.models_1_Max.cor.Y.rcv.1X1.Interact 4 1
## 9 fit.models_1_RFE.X 2 7
## 5 fit.models_1_RFE.X 2 3
## 13 fit.models_1_All.X 3 1
## 11 fit.models_1_RFE.X 2 9
## 7 fit.models_1_RFE.X 2 5
## 8 fit.models_1_RFE.X 2 6
## 4 fit.models_1_RFE.X 2 2
## 3 fit.models_1_RFE.X 2 1
## 6 fit.models_1_RFE.X 2 4
## 14 fit.models_1_Best.Interact 4 0
## 1 fit.models_1_bgn 1 0
## 2 fit.models_1_RFE.X 2 0
## 12 fit.models_1_All.X 3 0
## label_minor bgn end elapsed duration
## 10 avNNet 412.126 488.419 76.293 76.293
## 15 glmnet 527.715 570.359 42.644 42.644
## 9 nnet 379.334 412.125 32.791 32.791
## 5 glmnet 325.557 351.063 25.506 25.506
## 13 glmnet 502.445 527.686 25.241 25.241
## 11 earth 488.419 502.437 14.018 14.018
## 7 gbm 357.168 369.179 12.011 12.011
## 8 rf 369.179 379.333 10.155 10.154
## 4 bayesglm 316.255 325.557 9.302 9.302
## 3 glm 309.189 316.254 7.066 7.065
## 6 rpart 351.064 357.167 6.103 6.103
## 14 setup 527.687 527.714 0.027 0.027
## 1 setup 309.170 309.180 0.010 0.010
## 2 setup 309.180 309.188 0.008 0.008
## 12 setup 502.438 502.445 0.007 0.007
## [1] "Total Elapsed Time: 570.359 secs"